The battle of neighborhoods

1. Gathering data

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium

print('Libraries imported.')


Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/DSX-Python35

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2019.3.9   |       hecc5488_0         146 KB  conda-forge
    geopy-1.20.0               |             py_0          57 KB  conda-forge
    certifi-2018.8.24          |        py35_1001         139 KB  conda-forge
    openssl-1.0.2r             |       h14c3975_0         3.1 MB  conda-forge
    geographiclib-1.49         |             py_0          32 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         3.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.49-py_0         conda-forge
    geopy:           1.20.0-py_0       conda-forge

The following packages will be UPDATED:

   

In [2]:
!wget -q -O '5tiy-yfrg.json' https://data.cityofchicago.org/resource/5tiy-yfrg.json
#print('Data downloaded!')

with open('5tiy-yfrg.json') as json_data:
    chicago_data = json.load(json_data)
#chicago_data
neighborhoods_data = chicago_data

#neighborhoods_data[0]
#community_area_name
#x_coordinate': '1161152.12',
#  'y_coordinate': '1829643.669',
#  'zip_code': '60655

# define the dataframe columns
column_names = ['Borough', 'Latitude', 'Longitude', 'PostalCodes'] 

# instantiate the dataframe
neighborhoods = pd.DataFrame(columns=column_names)

for data in neighborhoods_data:
    borough = data['community_area_name'].lower().title()
            
    neighborhood_lat = data['latitude']
    neighborhood_lon= data['longitude']
    neig_zip_code=data['zip_code']
    #neighborhood_lat = neighborhood_latlon[1]
    #neighborhood_lon = neighborhood_latlon[0]
    
    neighborhoods = neighborhoods.append({'Borough': borough,                                          
                                          'Latitude': float(neighborhood_lat),
                                          'Longitude': float(neighborhood_lon),
                                          'PostalCodes': neig_zip_code}, ignore_index=True)



neighborhoods.shape

#https://en.wikipedia.org/wiki/List_of_neighborhoods_in_Chicago

df1=neighborhoods.drop_duplicates(subset=['Borough'])

df1.head()


Unnamed: 0,Borough,Latitude,Longitude,PostalCodes
0,Morgan Park,41.688218,-87.685663,60655
1,Dunning,41.942154,-87.776506,60634
2,Austin,41.885205,-87.763212,60644
3,South Lawndale,41.852691,-87.696278,60623
4,Belmont Cragin,41.934966,-87.770165,60634


In [3]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
df=pd.DataFrame()

website_url= requests.get('https://en.wikipedia.org/wiki/List_of_neighborhoods_in_Chicago').text

Treat wikipedia text with BeautifulSoup:

In [4]:
soup = BeautifulSoup(website_url,'lxml')

Find the necessary table using BeautifulSoup:

In [5]:
My_table = soup.find('table',{'class':'wikitable sortable'})


Finally, scrape the website finding all the td elements within the tr elements.
Then, by using a loop, append records to newly created empty lists - each list per each wikipedia's table column .


In [6]:
links = My_table.findAll('a')
Neighbourhood=[]
Borough=[]

#for link in My_table.find_all('a'):
 #   tds =link.get('title').replace(', Chicago','')
    #print(link.get('title').replace(', Chicago',''))
  #  Neighbourhood.append(tds)
    
links = My_table.findAll('tr')

for tr in links[1:]:
    tds = tr.find_all('td')
    #print(tds[1].text)
    Borough.append(tds[1].text.rstrip("\n"))
    Neighbourhood.append(tds[0].text.rstrip("\n"))

df['Borough']=Borough
df['Neighborhood']=Neighbourhood
df.head()

Unnamed: 0,Borough,Neighborhood
0,Albany Park,Albany Park
1,Riverdale,Altgeld Gardens
2,Edgewater,Andersonville
3,Archer Heights,Archer Heights
4,Armour Square,Armour Square




Print the number of rows of your dataframe:

In [7]:
df2=pd.merge(df, df1, on='Borough')
df2.head()
result=pd.DataFrame()
result = df2.groupby(['PostalCodes','Borough', 'Latitude', 'Longitude'], sort=True).agg(', '.join)

result = result.reset_index()
result

Unnamed: 0,PostalCodes,Borough,Latitude,Longitude,Neighborhood
0,60605,Near South Side,41.866856,-87.628651,"Central Station, Dearborn Park, Museum Campus,..."
1,60607,Near West Side,41.874327,-87.660976,"Fulton River District, Greektown, Illinois Med..."
2,60608,Lower West Side,41.852673,-87.663769,"East Pilsen, Heart of Chicago, Lower West Side..."
3,60609,Bridgeport,41.82999,-87.641066,Bridgeport
4,60609,Fuller Park,41.801699,-87.632974,Fuller Park
5,60609,New City,41.795649,-87.642615,"Back of the Yards, Canaryville, New City"
6,60610,Near North Side,41.906573,-87.635514,"Cabrini–Green, Gold Coast, Goose Island, Magni..."
7,60612,East Garfield Park,41.870912,-87.699887,"East Garfield Park, Fifth City"
8,60613,Uptown,41.959978,-87.649712,"Buena Park, Clarendon Park, Margate Park, New ..."
9,60614,Lincoln Park,41.924497,-87.644522,"Lincoln Park, Old Town Triangle, Park West, Ra..."


-----------------------------------
Starting to explore and cluster the neighborhoods in Chicago:

In [8]:
# Get the geographical coordinates of Toronto
address = 'Chicago, IL, USA'

geolocator = Nominatim(user_agent="traveler")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Chicago, IL, USA are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Chicago, IL, USA are 41.8755616, -87.6244212.


In [9]:
# create map of Chicago using latitude and longitude values
chicago = folium.Map(location=[latitude, longitude], zoom_start=10)


# add markers to map
for lat, lng, borough, neighborhood in zip(result['Latitude'], result['Longitude'], result['Borough'], result['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(chicago)  
    
chicago

 let's simplify the above map and segment and cluster only the neighborhoods in Downtown Toronto. 
 So let's slice the original dataframe and create a new dataframe of the Downtown data.

Next, we are going to start utilizing the Foursquare API to explore the neighborhoods and segment them.

Define Foursquare Credentials and Version:

In [10]:
CLIENT_ID = 'KKQW0B4HIZYMF3FXNFI1NK2WGUC33FE34E3S00PBKVODZMTU' # your Foursquare ID
CLIENT_SECRET = '0G5FN0C12BJAWOVNUGCZRSBSP4JQUMC4GEG1NN1HXV5CSFT2' # your Foursquare Secret
VERSION = '20190527' # Foursquare API version
LIMIT = 300
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)


Your credentails:
CLIENT_ID: KKQW0B4HIZYMF3FXNFI1NK2WGUC33FE34E3S00PBKVODZMTU
CLIENT_SECRET:0G5FN0C12BJAWOVNUGCZRSBSP4JQUMC4GEG1NN1HXV5CSFT2


----------------
Create a function to return the unique categories for each neighbourhood.

In [11]:
def exploreNbhd(names, latitudes, longitudes, radius=500):
    
    venues=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
       # print(name)    
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat,
            lng,
            #'52e81612bcbc57f1066b7a04', 
            radius, 
            LIMIT)
        
        # make the GET request
        results = requests.get(url).json()['response']['groups'][0]['items']
               
        
        # return only relevant information for each nearby venue
        venues.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])
      
    
    nearby_venues = pd.DataFrame([item for venue in venues for item in venue])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']

       
    return(nearby_venues)

Get venues for each of the neighbourhood

In [12]:
venues = exploreNbhd(names=result['Neighborhood'],
                                   latitudes=result['Latitude'],
                                   longitudes=result['Longitude']
                                  )

venues.head(5)
#venues.loc[venues['Neighborhood'].isin(['Armour Square, Chinatown, Wentworth Gardens'])].head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Central Station, Dearborn Park, Museum Campus,...",41.866856,-87.628651,Stan's Donuts & Coffee,41.867516,-87.626402,Coffee Shop
1,"Central Station, Dearborn Park, Museum Campus,...",41.866856,-87.628651,Tejas Yoga,41.865584,-87.626084,Yoga Studio
2,"Central Station, Dearborn Park, Museum Campus,...",41.866856,-87.628651,Eleven City Diner,41.86845,-87.626118,Diner
3,"Central Station, Dearborn Park, Museum Campus,...",41.866856,-87.628651,Kriser's Natural Pet,41.869137,-87.627229,Pet Service
4,"Central Station, Dearborn Park, Museum Campus,...",41.866856,-87.628651,Five Guys,41.867697,-87.625852,Burger Joint


Calc the ratios:

In [13]:
# Drop redundant columns
venues_cat = venues.drop(['Venue','Venue Longitude','Venue Latitude'],axis=1,inplace=False)

# Calculate the total count of venues for each Neighbourhood
venues_total = venues_cat.groupby(['Neighborhood','Neighborhood Latitude','Neighborhood Longitude']).count().rename(columns={'Venue Category':'Total Count'}).reset_index()


# Rename all kinds of Restaurants to have the same name 'Restaurant'
venues_cat.loc[venues_cat['Venue Category'].str.contains("Restaurant") , 'Venue Category'] = 'Restaurant'
venues_cat.loc[venues_cat['Venue Category'].str.contains("Grocery") , 'Venue Category'] = 'Grocery'
#venues_cat.head()
#venues_cat.loc[(venues_cat['Venue Category Id'].isin(["5293a7563cf9994f4e043a44", "52e81612bcbc57f1066b7a04", '52f2ae52bcbc57f1066b8b81', '56aa371be4b08b9a8d57355a', 
#'52e928d0bcbc57f1066b7e96', '4bf58dd8d48988d109941735', '52e928d0bcbc57f1066b7e97', '58daa1558bbb0b01f18ec1ee', 
#'56aa371be4b08b9a8d5734f3', '52960bac3cf9994f4e043ac4', '52e928d0bcbc57f1066b7e98','52e81612bcbc57f1066b7a02', '52e81612bcbc57f1066b7a01', '5293a7d53cf9994f4e043a45', '5744ccdfe4b0c0459246b4d0']))].head()


#venues_cat.loc[venues_cat['Neighborhood'].isin(['Armour Square, Chinatown, Wentworth Gardens'])].head()

# Caculate the count of Asian restaurant for each Neighbourhood
venues_restaurant = venues_cat[venues_cat['Venue Category'].isin(['Restaurant'])]
venues_restaurant_total = venues_restaurant.groupby('Neighborhood').count().rename(columns={'Venue Category':'Restaurants Count'}).reset_index().drop(columns=['Neighborhood Latitude','Neighborhood Longitude'])

venues_shopping = venues_cat[venues_cat['Venue Category'].isin(['Grocery'])]
venues_shopping_total = venues_shopping.groupby('Neighborhood').count().rename(columns={'Venue Category':'Grocery Count'}).reset_index().drop(columns=['Neighborhood Latitude','Neighborhood Longitude'])

# Left join the two dataframes to calculate the ratio of restaurants for each neighbourhood
venues_r = venues_total.merge(venues_restaurant_total,how='left',on='Neighborhood').fillna(0)
venues_s= venues_r.merge(venues_shopping_total,how='left',on='Neighborhood').fillna(0)

venues_s['Ratio'] = venues_s['Restaurants Count']/venues_s['Total Count']
venues_s['Ratio_grocery'] = venues_s['Grocery Count']/venues_s['Total Count']

# Drop redundant columns
venues_s.drop(['Total Count'],axis=1,inplace=True) #,'Restaurants Count'
venues_s.head()
#venues_shopping.head()
#venues_cat['Venue Category']

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Restaurants Count,Grocery Count,Ratio,Ratio_grocery
0,"Albany Park, Mayfair, North Mayfair, Ravenswoo...",41.971143,-87.709627,9.0,2.0,0.28125,0.0625
1,"Altgeld Gardens, Eden Green, Golden Gate, Rive...",41.657432,-87.606496,0.0,1.0,0.0,0.25
2,"Andersonville, Edgewater, Edgewater Beach, Edg...",41.976216,-87.669762,15.0,1.0,0.180723,0.012048
3,Archer Heights,41.803046,-87.722007,3.0,0.0,0.15,0.0
4,"Armour Square, Chinatown, Wentworth Gardens",41.843952,-87.635318,4.0,0.0,0.333333,0.0


Analyze Each Neighborhood
Get the top 5 most common venues in each neighborhood

In [14]:
# one hot encoding
chicago_onehot = pd.get_dummies(venues_cat[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
chicago_onehot['Neighborhood'] = venues_cat['Neighborhood'] 

#Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category
chicago_grouped = chicago_onehot.groupby('Neighborhood').mean().reset_index()
chicago_grouped.head()

Unnamed: 0,Neighborhood,ATM,Adult Boutique,Airport,Antique Shop,Art Gallery,Art Museum,Arts & Crafts Store,Arts & Entertainment,Athletics & Sports,Auto Dealership,Automotive Shop,BBQ Joint,Bakery,Bank,Bar,Baseball Field,Basketball Court,Beach,Beer Bar,Beer Garden,Beer Store,Big Box Store,Bike Rental / Bike Share,Bookstore,Boutique,Bowling Alley,Boxing Gym,Breakfast Spot,Brewery,Building,Burger Joint,Burrito Place,Bus Line,Bus Station,Business Service,Café,Check Cashing Service,Clothing Store,Cocktail Bar,Coffee Shop,College Arts Building,College Stadium,Comedy Club,Comic Shop,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Costume Shop,Coworking Space,Creperie,Cupcake Shop,Currency Exchange,Cycle Studio,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Diner,Discount Store,Distillery,Dive Bar,Doctor's Office,Dog Run,Donut Shop,Elementary School,Event Space,Farmers Market,Field,Financial or Legal Service,Fish & Chips Shop,Flower Shop,Food,Food & Drink Shop,Food Truck,Football Stadium,Fried Chicken Joint,Frozen Yogurt Shop,Furniture / Home Store,Gaming Cafe,Garden,Gas Station,Gay Bar,Gift Shop,Golf Course,Gourmet Shop,Grocery,Gym,Gym / Fitness Center,Harbor / Marina,Hardware Store,Health & Beauty Service,Health Food Store,Historic Site,History Museum,Hobby Shop,Home Service,Hookah Bar,Hostel,Hot Dog Joint,Hotel,Ice Cream Shop,Indie Theater,Intersection,Irish Pub,Jewelry Store,Juice Bar,Kids Store,Laundromat,Light Rail Station,Lingerie Store,Liquor Store,Locksmith,Lounge,Market,Martial Arts Dojo,Massage Studio,Men's Store,Metro Station,Miscellaneous Shop,Mobile Phone Shop,Monument / Landmark,Motorcycle Shop,Movie Theater,Moving Target,Multiplex,Museum,Music Venue,Nail Salon,Nightclub,Nightlife Spot,Optical Shop,Other Repair Shop,Paper / Office Supplies Store,Park,Performing Arts Venue,Pet Service,Pet Store,Pharmacy,Pie Shop,Pizza Place,Playground,Plaza,Poke Place,Pool,Pub,Public Art,Record Shop,Rental Car Location,Restaurant,Rock Club,Roof Deck,Salad Place,Salon / Barbershop,Sandwich Place,Science Museum,Sculpture Garden,Shipping Store,Shoe Store,Shop & Service,Shopping Mall,Shopping Plaza,Smoke Shop,Snack Place,Soccer Field,Spa,Speakeasy,Sporting Goods Shop,Sports Bar,Stadium,Steakhouse,Storage Facility,Supermarket,Supplement Shop,Taco Place,Tanning Salon,Tattoo Parlor,Tennis Court,Theater,Thrift / Vintage Store,Toy / Game Store,Track,Train Station,Video Game Store,Video Store,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,"Albany Park, Mayfair, North Mayfair, Ravenswoo...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03125,0.03125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03125,0.0,0.0,0.03125,0.03125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03125,0.03125,0.0,0.0,0.0,0.0,0.03125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03125,0.0,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.28125,0.0,0.0,0.0,0.0,0.03125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03125,0.0,0.0,0.0,0.0,0.0,0.0,0.03125,0.0,0.03125,0.03125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Altgeld Gardens, Eden Green, Golden Gate, Rive...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Andersonville, Edgewater, Edgewater Beach, Edg...",0.0,0.012048,0.0,0.024096,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012048,0.0,0.012048,0.0,0.0,0.0,0.024096,0.0,0.0,0.0,0.012048,0.024096,0.0,0.0,0.0,0.024096,0.012048,0.0,0.024096,0.0,0.0,0.012048,0.0,0.012048,0.0,0.0,0.0,0.024096,0.0,0.0,0.0,0.012048,0.0,0.0,0.012048,0.0,0.012048,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012048,0.0,0.012048,0.012048,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.024096,0.024096,0.0,0.0,0.0,0.0,0.012048,0.012048,0.0,0.0,0.0,0.012048,0.012048,0.0,0.0,0.012048,0.012048,0.012048,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012048,0.0,0.012048,0.012048,0.0,0.012048,0.0,0.0,0.0,0.0,0.0,0.0,0.012048,0.0,0.024096,0.0,0.0,0.012048,0.0,0.0,0.012048,0.0,0.0,0.0,0.0,0.0,0.0,0.012048,0.0,0.0,0.0,0.0,0.012048,0.0,0.0,0.0,0.0,0.0,0.024096,0.0,0.012048,0.024096,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.180723,0.0,0.0,0.0,0.024096,0.036145,0.0,0.012048,0.0,0.012048,0.0,0.0,0.0,0.0,0.012048,0.0,0.024096,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012048,0.012048,0.0,0.0,0.0,0.0,0.024096,0.0,0.012048,0.0,0.012048,0.0,0.0,0.0
3,Archer Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.15,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Armour Square, Chinatown, Wentworth Gardens",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Sort the venues in descending order. First, let's write a function to sort the venues in descending order.
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [16]:
# Create the new dataframe and display the top 5 venues for each neighborhood.
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = chicago_grouped['Neighborhood']

for ind in np.arange(chicago_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(chicago_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,"Albany Park, Mayfair, North Mayfair, Ravenswoo...",Restaurant,Ice Cream Shop,Grocery,Park,Bank
1,"Altgeld Gardens, Eden Green, Golden Gate, Rive...",Food,Grocery,Park,Clothing Store,Yoga Studio
2,"Andersonville, Edgewater, Edgewater Beach, Edg...",Restaurant,Sandwich Place,Pet Store,Salon / Barbershop,Bookstore
3,Archer Heights,Restaurant,Pizza Place,Discount Store,Sandwich Place,Cosmetics Shop
4,"Armour Square, Chinatown, Wentworth Gardens",Restaurant,Pizza Place,Park,Breakfast Spot,Flower Shop


merge

In [17]:
venues_merged = venues_s.merge(neighborhoods_venues_sorted,how='left',on='Neighborhood').set_index('Neighborhood',inplace=False)
venues_merged.head()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Restaurants Count,Grocery Count,Ratio,Ratio_grocery,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
"Albany Park, Mayfair, North Mayfair, Ravenswood Manor",41.971143,-87.709627,9.0,2.0,0.28125,0.0625,Restaurant,Ice Cream Shop,Grocery,Park,Bank
"Altgeld Gardens, Eden Green, Golden Gate, Riverdale",41.657432,-87.606496,0.0,1.0,0.0,0.25,Food,Grocery,Park,Clothing Store,Yoga Studio
"Andersonville, Edgewater, Edgewater Beach, Edgewater Glen, Lakewood / Balmoral",41.976216,-87.669762,15.0,1.0,0.180723,0.012048,Restaurant,Sandwich Place,Pet Store,Salon / Barbershop,Bookstore
Archer Heights,41.803046,-87.722007,3.0,0.0,0.15,0.0,Restaurant,Pizza Place,Discount Store,Sandwich Place,Cosmetics Shop
"Armour Square, Chinatown, Wentworth Gardens",41.843952,-87.635318,4.0,0.0,0.333333,0.0,Restaurant,Pizza Place,Park,Breakfast Spot,Flower Shop


4. Cluster Neighborhoods

Run k-means to cluster the neighborhood into 5 clusters.

In [18]:
# Get the unique category names
cat_unique=venues_cat['Venue Category'].unique()
# Create a dictionary for the mapping
dictionary = dict(zip(list(cat_unique), list(range(len(cat_unique)))))

# Mapping names of venue category into corresponding numbers
venues_clst=venues_merged.drop(columns=['Neighborhood Latitude','Neighborhood Longitude'],axis=1,inplace=False)
venues_clst['1st Most Common Venue'] = venues_clst['1st Most Common Venue'].replace(dictionary)
venues_clst['2nd Most Common Venue'] = venues_clst['2nd Most Common Venue'].replace(dictionary)
venues_clst['3rd Most Common Venue'] = venues_clst['3rd Most Common Venue'].replace(dictionary)
venues_clst['4th Most Common Venue'] = venues_clst['4th Most Common Venue'].replace(dictionary)
venues_clst['5th Most Common Venue'] = venues_clst['5th Most Common Venue'].replace(dictionary)

venues_clst.head()


Unnamed: 0_level_0,Restaurants Count,Grocery Count,Ratio,Ratio_grocery,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
"Albany Park, Mayfair, North Mayfair, Ravenswood Manor",9.0,2.0,0.28125,0.0625,7,36,6,19,66
"Altgeld Gardens, Eden Green, Golden Gate, Riverdale",0.0,1.0,0.0,0.25,148,6,19,25,1
"Andersonville, Edgewater, Edgewater Beach, Edgewater Glen, Lakewood / Balmoral",15.0,1.0,0.180723,0.012048,7,24,63,20,100
Archer Heights,3.0,0.0,0.15,0.0,7,5,68,24,29
"Armour Square, Chinatown, Wentworth Gardens",4.0,0.0,0.333333,0.0,7,5,19,14,60


In [19]:
# set number of clusters
kclusters = 10

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(venues_clst)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 8, 0, 6, 0, 7, 7, 3, 9, 3], dtype=int32)


Showing Centers of Each Cluster

In [20]:
# add clustering labels
venues_merged.insert(0, 'Cluster Labels', kmeans.labels_)

venues_merged['factor'] =   venues_merged['Restaurants Count'] * venues_merged['Ratio'] +  venues_merged['Grocery Count']*venues_merged['Ratio_grocery']

venues_merged.sort_values(axis = 0, by = ['factor'], ascending=False, inplace=True)

venues_merged.reset_index(inplace=True)
venues_merged

Unnamed: 0,Neighborhood,Cluster Labels,Neighborhood Latitude,Neighborhood Longitude,Restaurants Count,Grocery Count,Ratio,Ratio_grocery,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,factor
0,"Lincoln Park, Old Town Triangle, Park West, Ra...",0,41.924497,-87.644522,27.0,1.0,0.27,0.01,Restaurant,Bar,Sandwich Place,Coffee Shop,Pizza Place,7.3
1,"East Village, Noble Square, Pulaski Park, Rive...",3,41.899581,-87.681902,20.0,2.0,0.289855,0.028986,Restaurant,Bakery,Dive Bar,Salon / Barbershop,Pub,5.855072
2,"Belmont Central, Brickyard, Cragin, Hanson Park",7,41.934966,-87.770165,13.0,1.0,0.351351,0.027027,Restaurant,Discount Store,Pizza Place,ATM,Bakery,4.594595
3,"Boystown, Graceland West, Lake View, Lake View...",0,41.93412,-87.656192,12.0,0.0,0.25,0.0,Restaurant,Bar,Pizza Place,Sports Bar,Pub,3.0
4,"Kenwood, North Kenwood",3,41.803755,-87.590384,13.0,1.0,0.220339,0.016949,Restaurant,Cosmetics Shop,Hotel,Pizza Place,Sandwich Place,2.881356
5,West Garfield Park,6,41.878772,-87.724649,7.0,1.0,0.388889,0.055556,Restaurant,Clothing Store,Shoe Store,Intersection,Sandwich Place,2.777778
6,"Andersonville, Edgewater, Edgewater Beach, Edg...",0,41.976216,-87.669762,15.0,1.0,0.180723,0.012048,Restaurant,Sandwich Place,Pet Store,Salon / Barbershop,Bookstore,2.722892
7,"Avondale, Jackowo, Wacławowo",3,41.940019,-87.702598,8.0,1.0,0.333333,0.041667,Restaurant,Food Truck,Gaming Cafe,Bar,Burger Joint,2.708333
8,"Albany Park, Mayfair, North Mayfair, Ravenswoo...",0,41.971143,-87.709627,9.0,2.0,0.28125,0.0625,Restaurant,Ice Cream Shop,Grocery,Park,Bank,2.65625
9,"Little Village, Marshall Square, South Lawndale",7,41.852691,-87.696278,7.0,0.0,0.368421,0.0,Restaurant,Bike Rental / Bike Share,Taco Place,Discount Store,Bakery,2.578947


In [30]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(venues_merged['Neighborhood Latitude'], venues_merged['Neighborhood Longitude'], venues_merged['Neighborhood'], venues_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

Results:

Best Group is 0 (red);
Second Best Group is 3 (ocean blue);
Third Best Group is 7 (olive);