In [77]:
import sys
!{sys.executable} -m pip install pandas requests lxml html5lib tqdm openpyxl xlrd sklearn geopy folium matplotlib

Collecting matplotlib
[?25l  Downloading https://files.pythonhosted.org/packages/71/07/16d781df15be30df4acfd536c479268f1208b2dfbc91e9ca5d92c9caf673/matplotlib-3.0.2-cp36-cp36m-manylinux1_x86_64.whl (12.9MB)
[K    100% |████████████████████████████████| 12.9MB 1.4MB/s eta 0:00:01
Collecting pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 (from matplotlib)
[?25l  Downloading https://files.pythonhosted.org/packages/de/0a/001be530836743d8be6c2d85069f46fecf84ac6c18c7f5fb8125ee11d854/pyparsing-2.3.1-py2.py3-none-any.whl (61kB)
[K    100% |████████████████████████████████| 71kB 25.4MB/s ta 0:00:01
[?25hCollecting kiwisolver>=1.0.1 (from matplotlib)
[?25l  Downloading https://files.pythonhosted.org/packages/69/a7/88719d132b18300b4369fbffa741841cfd36d1e637e1990f27929945b538/kiwisolver-1.0.1-cp36-cp36m-manylinux1_x86_64.whl (949kB)
[K    100% |████████████████████████████████| 952kB 12.3MB/s ta 0:00:01
[?25hCollecting cycler>=0.10 (from matplotlib)
  Downloading https://files.pythonhosted.org/p

Import packages

In [78]:
import requests
import re
import numpy as np
from tqdm import tqdm
import pandas as pd
import html5lib
from xml.etree import ElementTree
import os
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import folium # map rendering library

import matplotlib.cm as cm
import matplotlib.colors as colors

1. We should collect all neighbourhoods of Moscow, Russia. At this tep we can get all postal codesfrom http://mosopen.ru/streets/post_codes_list. But we must pre-process data to have unique postal codes of regions.

Download **Moscow** postal codes data from site

In [3]:
moscow_postcodes_link='http://mosopen.ru/streets/post_codes_list'
raw_moscow_postcodes_page=requests.get(moscow_postcodes_link)
moscow_postcodes_page=raw_moscow_postcodes_page.text

Find start and end of HTML table inside page

In [4]:
page_table_start = moscow_postcodes_page.find('<table class="table_list">')
page_table_end = moscow_postcodes_page.find('</table>', page_table_start)
print(page_table_start, page_table_end)

17716 66404


Parse HTML table from page to variable

In [5]:
moscow_postcodes_page_table = moscow_postcodes_page[page_table_start : page_table_end]

Read HTML table to pandas DataFrame, rename columns and transform data type from _int_ to _str_

In [6]:
moscow_postcodes_df = pd.read_html(moscow_postcodes_page_table, header =0)[0]
# Rename columns
moscow_postcodes_df.columns = ['city_code', 'postal_code']
# Transform city_code to string
moscow_postcodes_df['city_code']= moscow_postcodes_df['city_code'].astype(str)

Split joined postcodes in column to rows

In [7]:
postcodes = moscow_postcodes_df['postal_code'].str.split(',', expand=True).stack().str.strip().reset_index(level=1, drop=True)
postcodes = postcodes.rename('postal_code')

Drop old column with joined postcodes

In [8]:
moscow_postcodes_df = moscow_postcodes_df.drop(['postal_code'], axis=1)

Merge city codes with postal codes

In [9]:
moscow_postcodes_df = pd.concat([moscow_postcodes_df, postcodes], axis=1)

Join city code with postal code into one column

In [10]:
moscow_postcodes_df['postal_code'] = moscow_postcodes_df['city_code'] + moscow_postcodes_df['postal_code']

Drop unused city code

In [11]:
moscow_postcodes_df = moscow_postcodes_df.drop(['city_code'], axis=1)

Convert **DataFrame** to **Series** and print result post codes list

In [12]:
moscow_postcodes = moscow_postcodes_df['postal_code']
moscow_postcodes.head()

0    101000
1    103070
1    103132
1    103274
2    105005
Name: postal_code, dtype: object

2. After that we have list of all Moscow postal codes and can get them coordinates from Openstreetmap with Nominatim API. URL must view like https://nominatim.openstreetmap.org/search?format=xml&city=Moscow&postalcode=117623

Create functiun to get **latitude** and **longitude** from **postal** code in Moscow
<br>URL configured to get information in _JSON_ format, only from _Moscow_.
<br>URL request return list of coordinates. Find only request on post code.

In [13]:
def get_postal_code_location(postal_code):
    # Inti null, if request in empty
    latitude = ''
    longitude = ''
    # generate link for postal code
    link_template='https://nominatim.openstreetmap.org/search?format=json&city=Moscow&postalcode={}'.format(postal_code)
    # get link request in JSON
    raw_json=requests.get(link_template).json()
    
    # Itterate all elements in JSON
    for element in raw_json:
        # Get only postcodes information instead city
        if element['type'] == 'postcode':
            # extract latitude and longitude
            latitude = element['lat']
            longitude = element['lon']
    
    return latitude, longitude    

Test function on different postal codes

In [14]:
print('129110', get_postal_code_location('129110'))
print('117209', get_postal_code_location('117209'))

129110 ('55.7852052906741', '37.6345827572715')
117209 ('55.6624249494477', '37.5769160148382')


Apply function to all postal codes and insert informtion into DataFrame

In [30]:
def create_postcodes_file():
    postcodes_loc_df = pd.DataFrame(columns=['postal_code', 'latitude', 'longitude'])

    for postal_code in tqdm(moscow_postcodes):
        # Get latitude and longitude from function
        latitude, longitude = get_postal_code_location(postal_code)
        # Add data to DataFrame
        postcodes_loc_df = postcodes_loc_df.append({'postal_code' : postal_code , 'latitude' : latitude, 'longitude': longitude} , ignore_index=True)

    # Drop null coordinates
    postcodes_loc_df = postcodes_loc_df[postcodes_loc_df['latitude'] != '']
        
    # Write DataFrame to Excel file to improve performance
    writer = pd.ExcelWriter('Moscow_postalcodes_coordinates.xlsx')
    postcodes_loc_df.to_excel(writer,'main')
    writer.save()
    
    print('Done')
    
    return postcodes_loc_df

If Excel file exist, then load it, else create file with postal codes coordinates

In [87]:
exists = os.path.isfile('Moscow_postalcodes_coordinates.xlsx')
if exists:
    print('File exist. Load.')
    postcodes_loc_df = pd.read_excel('Moscow_postalcodes_coordinates.xlsx')
else:
    print('File not exist. Create.')
    postcodes_loc_df = create_postcodes_file()

File exist. Load.


Print head

In [86]:
postcodes_loc_df.head()

Unnamed: 0,postal_code,latitude,longitude
0,101000,55.760943,37.634282
3,103274,55.752841,37.574511
4,105005,55.767631,37.679706
5,105037,55.793867,37.773689
6,105043,55.792142,37.790378


For optimisation get top 100 postal codes

In [88]:
postcodes_loc_df = postcodes_loc_df.head(300)

3. After collecting all coordinates of postal codes in Moscow to pandas DataFrame we can get infromation about most populat venues from Foursquare.

Define Foursquare Credentials and Version¶

In [44]:
CLIENT_ID = 'Y3FU1X0JU1Z2YKUNLTUCCWH5K0DMDA0AZQAWWMFM4I3XIOZI' # your Foursquare ID
CLIENT_SECRET = '0G02XW3JUA4XIBNNBB5N1G4QUXC1MY3QZGT21DGJITJ2YYY3' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)
print('LIMIT = ', LIMIT)
print('radius = ', radius)

Your credentails:
CLIENT_ID: Y3FU1X0JU1Z2YKUNLTUCCWH5K0DMDA0AZQAWWMFM4I3XIOZI
CLIENT_SECRET:0G02XW3JUA4XIBNNBB5N1G4QUXC1MY3QZGT21DGJITJ2YYY3
LIMIT =  100
radius =  500


Let's create a function to repeat the same process to all the neighborhoods in Manhattan

In [45]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in tqdm(zip(names, latitudes, longitudes), total=len(names)):
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
        
#         print(url)
        
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

#### Now write the code to run the above function on each neighborhood and create a new dataframe called *manhattan_venues*.

In [89]:
moscow_venues = getNearbyVenues(postcodes_loc_df['postal_code'], postcodes_loc_df['latitude'], postcodes_loc_df['longitude'])




  0%|          | 0/300 [00:00<?, ?it/s][A[A[A


  0%|          | 1/300 [00:00<02:33,  1.95it/s][A[A[A


  1%|          | 2/300 [00:00<02:08,  2.31it/s][A[A[A


  1%|          | 3/300 [00:01<02:15,  2.18it/s][A[A[A


  1%|▏         | 4/300 [00:01<01:57,  2.52it/s][A[A[A


  2%|▏         | 5/300 [00:02<02:07,  2.32it/s][A[A[A


  2%|▏         | 6/300 [00:02<02:30,  1.95it/s][A[A[A


  2%|▏         | 7/300 [00:03<02:26,  2.00it/s][A[A[A


  3%|▎         | 8/300 [00:03<02:20,  2.08it/s][A[A[A


  3%|▎         | 9/300 [00:03<02:00,  2.41it/s][A[A[A


  3%|▎         | 10/300 [00:04<01:48,  2.68it/s][A[A[A


  4%|▎         | 11/300 [00:04<01:45,  2.75it/s][A[A[A


  4%|▍         | 12/300 [00:04<01:37,  2.96it/s][A[A[A


  4%|▍         | 13/300 [00:05<01:41,  2.82it/s][A[A[A


  5%|▍         | 14/300 [00:05<01:34,  3.04it/s][A[A[A


  5%|▌         | 15/300 [00:05<01:42,  2.78it/s][A[A[A


  5%|▌         | 16/300 [00:06<01:35,  2.98it/s][A[A

 45%|████▌     | 136/300 [00:50<00:53,  3.08it/s][A[A[A


 46%|████▌     | 137/300 [00:51<00:51,  3.15it/s][A[A[A


 46%|████▌     | 138/300 [00:51<00:49,  3.27it/s][A[A[A


 46%|████▋     | 139/300 [00:52<01:13,  2.19it/s][A[A[A


 47%|████▋     | 140/300 [00:52<01:04,  2.50it/s][A[A[A


 47%|████▋     | 141/300 [00:52<01:01,  2.60it/s][A[A[A


 47%|████▋     | 142/300 [00:52<00:52,  3.00it/s][A[A[A


 48%|████▊     | 143/300 [00:53<00:59,  2.63it/s][A[A[A


 48%|████▊     | 144/300 [00:53<01:05,  2.38it/s][A[A[A


 48%|████▊     | 145/300 [00:54<00:55,  2.82it/s][A[A[A


 49%|████▊     | 146/300 [00:54<01:06,  2.31it/s][A[A[A


 49%|████▉     | 147/300 [00:55<01:04,  2.39it/s][A[A[A


 49%|████▉     | 148/300 [00:55<00:56,  2.68it/s][A[A[A


 50%|████▉     | 149/300 [00:55<00:49,  3.05it/s][A[A[A


 50%|█████     | 150/300 [00:56<00:48,  3.07it/s][A[A[A


 50%|█████     | 151/300 [00:56<00:47,  3.11it/s][A[A[A


 51%|█████     | 152/300

 90%|█████████ | 270/300 [01:41<00:14,  2.12it/s][A[A[A


 90%|█████████ | 271/300 [01:41<00:13,  2.13it/s][A[A[A


 91%|█████████ | 272/300 [01:41<00:11,  2.42it/s][A[A[A


 91%|█████████ | 273/300 [01:42<00:09,  2.72it/s][A[A[A


 91%|█████████▏| 274/300 [01:42<00:09,  2.70it/s][A[A[A


 92%|█████████▏| 275/300 [01:42<00:08,  2.78it/s][A[A[A


 92%|█████████▏| 276/300 [01:43<00:09,  2.43it/s][A[A[A


 92%|█████████▏| 277/300 [01:43<00:09,  2.30it/s][A[A[A


 93%|█████████▎| 278/300 [01:44<00:08,  2.45it/s][A[A[A


 93%|█████████▎| 279/300 [01:44<00:08,  2.48it/s][A[A[A


 93%|█████████▎| 280/300 [01:44<00:07,  2.78it/s][A[A[A


 94%|█████████▎| 281/300 [01:45<00:06,  2.91it/s][A[A[A


 94%|█████████▍| 282/300 [01:45<00:05,  3.05it/s][A[A[A


 94%|█████████▍| 283/300 [01:45<00:05,  2.97it/s][A[A[A


 95%|█████████▍| 284/300 [01:46<00:06,  2.61it/s][A[A[A


 95%|█████████▌| 285/300 [01:46<00:05,  2.88it/s][A[A[A


 95%|█████████▌| 286/300

In [90]:
print(moscow_venues.shape)
moscow_venues.head()

(6601, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,101000,55.760943,37.634282,Хачапури,55.761583,37.635856,Caucasian Restaurant
1,101000,55.760943,37.634282,Чайных дел мастерская,55.761063,37.635439,Tea Room
2,101000,55.760943,37.634282,Time Club «Убежище»,55.761148,37.637729,Gaming Cafe
3,101000,55.760943,37.634282,Double B Coffee & Tea,55.761527,37.631717,Coffee Shop
4,101000,55.760943,37.634282,Особняк Салтыкова-Черткова,55.760387,37.631293,Historic Site


In [91]:
moscow_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
101000,98,98,98,98,98,98
103274,23,23,23,23,23,23
105005,47,47,47,47,47,47
105037,17,17,17,17,17,17
105043,19,19,19,19,19,19
105062,65,65,65,65,65,65
105064,42,42,42,42,42,42
105066,32,32,32,32,32,32
105077,9,9,9,9,9,9
105082,14,14,14,14,14,14


#### Let's find out how many unique categories can be curated from all the returned venues

In [92]:
print('There are {} uniques categories.'.format(len(moscow_venues['Venue Category'].unique())))

There are 370 uniques categories.


## 3. Analyze Each Neighborhood

In [93]:
# one hot encoding
moscow_onehot = pd.get_dummies(moscow_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
moscow_onehot['Neighborhood'] = moscow_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [moscow_onehot.columns[-1]] + list(moscow_onehot.columns[:-1])
moscow_onehot = moscow_onehot[fixed_columns]

moscow_onehot.head()

Unnamed: 0,Zoo Exhibit,Accessories Store,Adult Boutique,American Restaurant,Amphitheater,Antique Shop,Aquarium,Arcade,Argentinian Restaurant,Art Gallery,...,Warehouse Store,Watch Shop,Water Park,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Zoo
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


And let's examine the new dataframe size.

In [94]:
moscow_onehot.shape

(6601, 370)

#### Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [95]:
moscow_grouped = moscow_onehot.groupby('Neighborhood').mean().reset_index()
moscow_grouped.head()

Unnamed: 0,Neighborhood,Zoo Exhibit,Accessories Store,Adult Boutique,American Restaurant,Amphitheater,Antique Shop,Aquarium,Arcade,Argentinian Restaurant,...,Warehouse Store,Watch Shop,Water Park,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Zoo
0,101000,0.0,0.0,0.010204,0.0,0.0,0.0,0.0,0.020408,0.0,...,0.0,0.0,0.0,0.0,0.020408,0.0,0.0,0.0,0.030612,0.0
1,103274,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,105005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.021277,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,105037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,105043,0.052632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Let's confirm the new size

In [96]:
moscow_grouped.shape

(298, 370)

#### Let's print each neighborhood along with the top 5 most common venues

In [97]:
num_top_venues = 5

for hood in moscow_grouped['Neighborhood']:
    print("----"+str(hood)+"----")
    temp = moscow_grouped[moscow_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----101000----
                  venue  freq
0           Coffee Shop  0.09
1                Hostel  0.05
2           Gaming Cafe  0.04
3  Caucasian Restaurant  0.04
4          Dance Studio  0.04


----103274----
                venue  freq
0  Seafood Restaurant  0.13
1  Italian Restaurant  0.09
2         Karaoke Bar  0.04
3        Gourmet Shop  0.04
4          Steakhouse  0.04


----105005----
                 venue  freq
0          Coffee Shop  0.09
1           Hookah Bar  0.06
2         Dance Studio  0.04
3  Dumpling Restaurant  0.04
4                Hotel  0.02


----105037----
                 venue  freq
0        Grocery Store  0.12
1          Supermarket  0.12
2  Arts & Crafts Store  0.06
3                 Café  0.06
4            Bookstore  0.06


----105043----
                 venue  freq
0  Sporting Goods Shop  0.11
1          Supermarket  0.11
2          Zoo Exhibit  0.05
3               Bakery  0.05
4   Salon / Barbershop  0.05


----105062----
                  venue  freq


               venue  freq
0        Supermarket  0.13
1  Outdoor Sculpture  0.07
2     Farmers Market  0.07
3        Pizza Place  0.07
4           Fountain  0.07


----109004----
                  venue  freq
0  Gym / Fitness Center  0.08
1            Restaurant  0.08
2           Coffee Shop  0.08
3           Karaoke Bar  0.04
4                Bakery  0.04


----109028----
                venue  freq
0         Art Gallery  0.07
1                Café  0.07
2               Plaza  0.07
3         Coffee Shop  0.04
4  Miscellaneous Shop  0.04


----109029----
                   venue  freq
0   Gym / Fitness Center  0.14
1                   Park  0.09
2  Performing Arts Venue  0.09
3                    Gym  0.05
4          Auto Workshop  0.05


----109044----
                           venue  freq
0                    Coffee Shop  0.09
1                Bed & Breakfast  0.05
2  Vegetarian / Vegan Restaurant  0.05
3                           Café  0.05
4               Sushi Restaurant  0.05




               venue  freq
0        Pizza Place  0.08
1               Café  0.08
2              Plaza  0.08
3        Coffee Shop  0.04
4  Convenience Store  0.04


----109548----
               venue  freq
0        Pizza Place  0.13
1     Cosmetics Shop  0.07
2                Gym  0.07
3   Pedestrian Plaza  0.07
4  Health Food Store  0.07


----109559----
                   venue  freq
0             Restaurant   0.1
1  Vietnamese Restaurant   0.1
2            Tailor Shop   0.1
3     Chinese Restaurant   0.1
4      Food & Drink Shop   0.1


----109649----
                 venue  freq
0    Korean Restaurant  0.25
1              Dog Run  0.25
2        Garden Center  0.25
3             Bus Stop  0.25
4  Peruvian Restaurant  0.00


----109651----
          venue  freq
0   Supermarket  0.25
1          Café  0.12
2  Dance Studio  0.12
3          Park  0.12
4      Pharmacy  0.12


----109652----
                  venue  freq
0           Karaoke Bar  0.25
1              Beer Bar  0.25
2        

               venue  freq
0        Supermarket  0.12
1           Bus Stop  0.12
2  Convenience Store  0.06
3               Park  0.06
4           Bus Line  0.06


----115404----
               venue  freq
0        Supermarket   0.2
1  Convenience Store   0.2
2             Bakery   0.2
3           Bus Stop   0.2
4  Health Food Store   0.2


----115407----
               venue  freq
0       Tram Station   0.1
1  Convenience Store   0.1
2           Pharmacy   0.1
3     Farmers Market   0.1
4   Sculpture Garden   0.1


----115408----
            venue  freq
0      Playground  0.75
1   Metro Station  0.25
2     Zoo Exhibit  0.00
3     Planetarium  0.00
4  Pilates Studio  0.00


----115409----
          venue  freq
0          Park  0.17
1   Supermarket  0.08
2      Bus Stop  0.08
3         Hotel  0.08
4  Soccer Field  0.08


----115419----
                  venue  freq
0            Board Shop  0.12
1  Gym / Fitness Center  0.12
2                Bakery  0.06
3          Concert Hall  0.06
4  

               venue  freq
0  Korean Restaurant  0.08
1  Electronics Store  0.08
2          Nightclub  0.08
3        Gas Station  0.08
4          Cafeteria  0.08


----117393----
               venue  freq
0               Café  0.21
1         Restaurant  0.11
2         Playground  0.11
3  Food & Drink Shop  0.05
4  Convenience Store  0.05


----117403----
            venue  freq
0        Bus Stop  0.43
1   Big Box Store  0.29
2        Bus Line  0.14
3  Farmers Market  0.14
4     Zoo Exhibit  0.00


----117405----
                  venue  freq
0  Fast Food Restaurant  0.17
1           Coffee Shop  0.08
2     Electronics Store  0.08
3      Pedestrian Plaza  0.08
4             Pet Store  0.08


----117418----
                 venue  freq
0                 Park  0.15
1          Flower Shop  0.08
2  Japanese Restaurant  0.08
3           Playground  0.08
4       Farmers Market  0.08


----117420----
                       venue  freq
0       Gym / Fitness Center  0.07
1  Middle Eastern Resta

              venue  freq
0       Coffee Shop  0.16
1              Park  0.05
2         Wine Shop  0.05
3          Pharmacy  0.04
4  Sushi Restaurant  0.04


----119180----
                  venue  freq
0             Nightclub  0.07
1           Art Gallery  0.07
2           Coffee Shop  0.07
3  Gym / Fitness Center  0.05
4                  Café  0.05


----119192----
               venue  freq
0               Café  0.06
1             Bakery  0.06
2  Convenience Store  0.06
3       Liquor Store  0.03
4         Hookah Bar  0.03


----119234----
               venue  freq
0     Science Museum  0.17
1              Plaza  0.17
2               Pool  0.17
3               Park  0.17
4  College Bookstore  0.17


----119261----
                  venue  freq
0     Convenience Store  0.16
1            Kids Store  0.10
2        Cosmetics Shop  0.06
3           Supermarket  0.06
4  Gym / Fitness Center  0.06


----119270----
               venue  freq
0  Electronics Store  0.17
1        Supermarket 

            venue  freq
0            Café  0.17
1  Cosmetics Shop  0.10
2             Gym  0.07
3      Playground  0.03
4         Theater  0.03


----121351----
               venue  freq
0  Convenience Store  0.14
1        Pizza Place  0.09
2           Bus Stop  0.09
3  Health Food Store  0.09
4        Zoo Exhibit  0.05


----121352----
                    venue  freq
0    Fast Food Restaurant  0.08
1          Clothing Store  0.08
2       Electronics Store  0.04
3  Furniture / Home Store  0.04
4             Coffee Shop  0.04


----121353----
            venue  freq
0   Auto Workshop  0.67
1       Cafeteria  0.33
2     Zoo Exhibit  0.00
3        Pet Café  0.00
4  Pilates Studio  0.00


----121354----
                venue  freq
0         Supermarket  0.31
1         Golf Course  0.08
2  Italian Restaurant  0.08
3               Hotel  0.08
4      Ice Cream Shop  0.08


----121357----
                       venue  freq
0                        Spa  0.25
1  Middle Eastern Restaurant  0.12


#### Let's put that into a *pandas* dataframe
First, let's write a function to sort the venues in descending order.

In [98]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [99]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = moscow_grouped['Neighborhood']

for ind in np.arange(moscow_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(moscow_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,101000,Coffee Shop,Hostel,Caucasian Restaurant,Gaming Cafe,Dance Studio,Café,Music Venue,Bar,Yoga Studio,Seafood Restaurant
1,103274,Seafood Restaurant,Italian Restaurant,Karaoke Bar,Hotel,Bridal Shop,Sporting Goods Shop,Café,Park,Gourmet Shop,Caucasian Restaurant
2,105005,Coffee Shop,Hookah Bar,Dance Studio,Dumpling Restaurant,Restaurant,Smoke Shop,Beer Bar,Beer Store,Tea Room,Miscellaneous Shop
3,105037,Supermarket,Grocery Store,Department Store,Arts & Crafts Store,Sushi Restaurant,Bookstore,Sporting Goods Shop,Café,Coffee Shop,Plaza
4,105043,Sporting Goods Shop,Supermarket,Zoo Exhibit,Salon / Barbershop,Sushi Restaurant,Soccer Field,Food & Drink Shop,Theater,Shopping Mall,Caucasian Restaurant
5,105062,Coffee Shop,Beer Bar,Café,Theater,Bakery,Gym / Fitness Center,Plaza,Restaurant,Fountain,Health & Beauty Service
6,105064,Coffee Shop,Gym / Fitness Center,Italian Restaurant,Theater,Caucasian Restaurant,Vegetarian / Vegan Restaurant,Greek Restaurant,Basketball Court,Toy / Game Store,Health & Beauty Service
7,105066,Coffee Shop,Gym / Fitness Center,Caucasian Restaurant,Café,Restaurant,Supermarket,Beer Bar,Miscellaneous Shop,Motorcycle Shop,Gun Range
8,105077,French Restaurant,Bed & Breakfast,Garden,Food & Drink Shop,Salon / Barbershop,Bus Stop,Convenience Store,Exhibit,Theater,Football Stadium
9,105082,Dessert Shop,Art Museum,Theater,Bar,Hookah Bar,Tea Room,Tailor Shop,Health Food Store,Cosmetics Shop,Russian Restaurant


## 4. Cluster Neighborhoods

Run *k*-means to cluster the neighborhood into 5 clusters.

In [100]:
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 5

moscow_grouped_clustering = moscow_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(moscow_grouped_clustering)

# check cluster labels generated for each row in the dataframe
len(kmeans.labels_)

298

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [131]:
postcodes_loc_df

Unnamed: 0,postal_code,latitude,longitude
0,101000,55.760943,37.634282
3,103274,55.752841,37.574511
4,105005,55.767631,37.679706
5,105037,55.793867,37.773689
6,105043,55.792142,37.790378
7,105062,55.760844,37.650049
8,105064,55.762052,37.659913
9,105066,55.771457,37.669986
10,105077,55.798057,37.812459
11,105082,55.776174,37.686221


In [132]:
neighborhoods_venues_sorted['Cluster Labels'] = kmeans.labels_
neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Cluster Labels
0,101000,Coffee Shop,Hostel,Caucasian Restaurant,Gaming Cafe,Dance Studio,Café,Music Venue,Bar,Yoga Studio,Seafood Restaurant,1
1,103274,Seafood Restaurant,Italian Restaurant,Karaoke Bar,Hotel,Bridal Shop,Sporting Goods Shop,Café,Park,Gourmet Shop,Caucasian Restaurant,1
2,105005,Coffee Shop,Hookah Bar,Dance Studio,Dumpling Restaurant,Restaurant,Smoke Shop,Beer Bar,Beer Store,Tea Room,Miscellaneous Shop,1
3,105037,Supermarket,Grocery Store,Department Store,Arts & Crafts Store,Sushi Restaurant,Bookstore,Sporting Goods Shop,Café,Coffee Shop,Plaza,1
4,105043,Sporting Goods Shop,Supermarket,Zoo Exhibit,Salon / Barbershop,Sushi Restaurant,Soccer Field,Food & Drink Shop,Theater,Shopping Mall,Caucasian Restaurant,1


Merge clustering group to neighbourhood

In [133]:
moscow_merged = postcodes_loc_df.merge(neighborhoods_venues_sorted, left_on='postal_code', right_on='Neighborhood', how='left')
moscow_merged = moscow_merged.dropna()
moscow_merged.shape

(298, 15)

Convert **Cluster Labels** to *integer*

In [134]:
moscow_merged['Cluster Labels'] = moscow_merged['Cluster Labels'].astype('int')

#### Use geopy library to get the latitude and longitude values of New York City.

In [136]:
address = 'Moscow, RUS'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of {} are {}, {}.'.format(address, latitude, longitude))

  This is separate from the ipykernel package so we can avoid doing imports until


The geograpical coordinate of Moscow, RUS are 55.7507178, 37.6176606.


Finally, let's visualize the resulting clusters

In [137]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(moscow_merged['latitude'], moscow_merged['longitude'], moscow_merged['Neighborhood'], moscow_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## 5. Examine Clusters

#### Cluster 1

In [143]:
moscow_merged.loc[moscow_merged['Cluster Labels'] == 0, moscow_merged.columns[[0] + list(range(4, moscow_merged.shape[1] - 1))]]

Unnamed: 0,postal_code,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
11,105118,Park,Supermarket,Stadium,Cosmetics Shop,Convenience Store,Fountain,Flea Market,Flower Shop,Food,Food & Drink Shop
14,105173,Supermarket,Convenience Store,Big Box Store,Bus Stop,Trail,Fountain,Flea Market,Flower Shop,Food,Food & Drink Shop
20,105275,Pharmacy,Supermarket,Bed & Breakfast,Food Service,Bus Stop,Convenience Store,Athletics & Sports,Fountain,Food,Food & Drink Shop
26,105568,Supermarket,Gym,Cafeteria,French Restaurant,Flower Shop,Food,Food & Drink Shop,Food Court,Food Service,Food Truck
32,107065,Supermarket,Bakery,Bus Stop,Gym,Pet Store,Skating Rink,Dairy Store,Shop & Service,Diner,Flower Shop
44,107497,Supermarket,Café,Dog Run,Car Wash,Auto Workshop,Bus Stop,Food,Food & Drink Shop,Food Court,Food Service
46,107564,Supermarket,Pizza Place,Bakery,Convenience Store,Football Stadium,Fish Market,Flea Market,Flower Shop,Food,Food & Drink Shop
54,109117,Supermarket,Pedestrian Plaza,Dog Run,Café,Park,Flower Shop,Food,Food & Drink Shop,Food Court,Food Service
55,109125,Supermarket,Italian Restaurant,Soccer Field,Convenience Store,Auto Workshop,Zoo,Football Stadium,Flea Market,Flower Shop,Food
57,109145,Supermarket,Grocery Store,Massage Studio,Outdoor Gym,Big Box Store,Hardware Store,Mobile Phone Shop,Flower Shop,Toy / Game Store,Train Station


#### Cluster 2

In [144]:
moscow_merged.loc[moscow_merged['Cluster Labels'] == 1, moscow_merged.columns[[0] + list(range(4, moscow_merged.shape[1] - 1))]]

Unnamed: 0,postal_code,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,101000,Coffee Shop,Hostel,Caucasian Restaurant,Gaming Cafe,Dance Studio,Café,Music Venue,Bar,Yoga Studio,Seafood Restaurant
1,103274,Seafood Restaurant,Italian Restaurant,Karaoke Bar,Hotel,Bridal Shop,Sporting Goods Shop,Café,Park,Gourmet Shop,Caucasian Restaurant
2,105005,Coffee Shop,Hookah Bar,Dance Studio,Dumpling Restaurant,Restaurant,Smoke Shop,Beer Bar,Beer Store,Tea Room,Miscellaneous Shop
3,105037,Supermarket,Grocery Store,Department Store,Arts & Crafts Store,Sushi Restaurant,Bookstore,Sporting Goods Shop,Café,Coffee Shop,Plaza
4,105043,Sporting Goods Shop,Supermarket,Zoo Exhibit,Salon / Barbershop,Sushi Restaurant,Soccer Field,Food & Drink Shop,Theater,Shopping Mall,Caucasian Restaurant
5,105062,Coffee Shop,Beer Bar,Café,Theater,Bakery,Gym / Fitness Center,Plaza,Restaurant,Fountain,Health & Beauty Service
6,105064,Coffee Shop,Gym / Fitness Center,Italian Restaurant,Theater,Caucasian Restaurant,Vegetarian / Vegan Restaurant,Greek Restaurant,Basketball Court,Toy / Game Store,Health & Beauty Service
7,105066,Coffee Shop,Gym / Fitness Center,Caucasian Restaurant,Café,Restaurant,Supermarket,Beer Bar,Miscellaneous Shop,Motorcycle Shop,Gun Range
8,105077,French Restaurant,Bed & Breakfast,Garden,Food & Drink Shop,Salon / Barbershop,Bus Stop,Convenience Store,Exhibit,Theater,Football Stadium
9,105082,Dessert Shop,Art Museum,Theater,Bar,Hookah Bar,Tea Room,Tailor Shop,Health Food Store,Cosmetics Shop,Russian Restaurant


#### Cluster 3

In [146]:
moscow_merged.loc[moscow_merged['Cluster Labels'] == 2, moscow_merged.columns[[0] + list(range(4, moscow_merged.shape[1] - 1))]]

Unnamed: 0,postal_code,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
78,109429,Bus Stop,Bus Line,Cafeteria,Zoo,French Restaurant,Flower Shop,Food,Food & Drink Shop,Food Court,Food Service
95,109649,Bus Stop,Korean Restaurant,Garden Center,Dog Run,French Restaurant,Flower Shop,Food,Food & Drink Shop,Food Court,Food Service
106,111395,Bus Stop,Supermarket,Lake,Food Truck,Gym / Fitness Center,Trail,Garden,Football Stadium,Flea Market,Flower Shop
119,111622,Department Store,Lake,Auto Workshop,Bus Stop,Skate Park,Garden,Garden Center,Fish Market,Flea Market,Flower Shop
136,115211,Bus Stop,Café,Laser Tag,Pier,Zoo,Flea Market,Flower Shop,Food,Food & Drink Shop,Food Court
156,115547,Bus Stop,Supermarket,Bar,Food Service,Gym / Fitness Center,Fountain,Flea Market,Flower Shop,Food,Food & Drink Shop
188,117403,Bus Stop,Big Box Store,Farmers Market,Bus Line,Fried Chicken Joint,Food,Food & Drink Shop,Food Court,Food Service,Food Truck
197,117463,Bus Stop,Hockey Arena,Convenience Store,Toy / Game Store,Pizza Place,Farm,Supermarket,Japanese Restaurant,Fountain,Flea Market
209,117574,Bus Stop,Gym Pool,Gym,Supermarket,Skating Rink,Gaming Cafe,Food Truck,Fish Market,Flea Market,Flower Shop
240,119297,Bus Stop,Food & Drink Shop,Train Station,French Restaurant,Flea Market,Flower Shop,Food,Food Court,Food Service,Food Truck


#### Cluster 4

In [147]:
moscow_merged.loc[moscow_merged['Cluster Labels'] == 3, moscow_merged.columns[[0] + list(range(4, moscow_merged.shape[1] - 1))]]

Unnamed: 0,postal_code,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
71,109382,Auto Workshop,Photography Studio,Zoo,Financial or Legal Service,Flea Market,Flower Shop,Food,Food & Drink Shop,Food Court,Food Service
133,115191,Auto Workshop,Boat or Ferry,Zoo,Fountain,Flea Market,Flower Shop,Food,Food & Drink Shop,Food Court,Food Service
135,115201,Auto Workshop,Zoo,Fountain,Flea Market,Flower Shop,Food,Food & Drink Shop,Food Court,Food Service,Food Truck
262,119619,Cafeteria,Auto Workshop,Financial or Legal Service,Flea Market,Flower Shop,Food,Food & Drink Shop,Food Court,Food Service,Food Truck
279,121353,Auto Workshop,Cafeteria,Financial or Legal Service,Flea Market,Flower Shop,Food,Food & Drink Shop,Food Court,Food Service,Food Truck


#### Cluster 5

In [148]:
moscow_merged.loc[moscow_merged['Cluster Labels'] == 4, moscow_merged.columns[[0] + list(range(4, moscow_merged.shape[1] - 1))]]

Unnamed: 0,postal_code,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
39,107207,Café,Sushi Restaurant,Bookstore,Cosmetics Shop,Convenience Store,Zoo,Fountain,Flower Shop,Food,Food & Drink Shop
40,107241,Café,Supermarket,Bookstore,Cosmetics Shop,Car Wash,Bus Stop,Restaurant,Intersection,Garden Center,Financial or Legal Service
72,109383,Skating Rink,Gym / Fitness Center,Convenience Store,Café,Football Stadium,Fish Market,Flea Market,Flower Shop,Food,Food & Drink Shop
75,109388,Skating Rink,Convenience Store,Café,Football Stadium,Fish Market,Flea Market,Flower Shop,Food,Food & Drink Shop,Food Court
175,117198,Café,Nightclub,Bakery,Breakfast Spot,Szechuan Restaurant,Soccer Field,Grocery Store,Tennis Court,Pedestrian Plaza,Gym Pool
180,117246,Café,Restaurant,Tennis Court,Flower Shop,Candy Store,Gym / Fitness Center,Gym,Public Art,Skate Park,Bus Stop
187,117393,Café,Restaurant,Playground,Trail,Grocery Store,Theater,Food & Drink Shop,Gym / Fitness Center,Historic Site,Convenience Store
224,119027,Café,Restaurant,Zoo,Fountain,Flea Market,Flower Shop,Food,Food & Drink Shop,Food Court,Food Service
261,119618,BBQ Joint,Auto Workshop,Athletics & Sports,Café,Zoo,Fountain,Flower Shop,Food,Food & Drink Shop,Food Court
299,123098,Dance Studio,Café,Park,Sporting Goods Shop,Department Store,Dive Bar,French Restaurant,Food & Drink Shop,Food Court,Food Service
