In [1]:
import sys
!{sys.executable} -m pip install pandas requests lxml html5lib tqdm

Collecting pandas
  Using cached https://files.pythonhosted.org/packages/e1/d8/feeb346d41f181e83fba45224ab14a8d8af019b48af742e047f3845d8cff/pandas-0.23.4-cp36-cp36m-manylinux1_x86_64.whl
Collecting lxml
  Using cached https://files.pythonhosted.org/packages/5d/d4/e81be10be160a6323cf5f29f1eabc9693080cb16780a2e19c96091ee37ee/lxml-4.3.0-cp36-cp36m-manylinux1_x86_64.whl
Collecting html5lib
  Using cached https://files.pythonhosted.org/packages/a5/62/bbd2be0e7943ec8504b517e62bab011b4946e1258842bc159e5dfde15b96/html5lib-1.0.1-py2.py3-none-any.whl
Collecting tqdm
[?25l  Downloading https://files.pythonhosted.org/packages/d1/f9/8cbd36ef8bf84c5281e4943eaa12fe34850a0e8204e44872d8ca0c0ec741/tqdm-4.29.0-py2.py3-none-any.whl (46kB)
[K    100% |████████████████████████████████| 51kB 2.9MB/s ta 0:00:01
[?25hCollecting pytz>=2011k (from pandas)
  Using cached https://files.pythonhosted.org/packages/61/28/1d3920e4d1d50b19bc5d24398a7cd85cc7b9a75a490570d5a30c57622d34/pytz-2018.9-py2.py3-none-any.whl
C

Import packages

In [2]:
import requests
import re
import numpy as np
from tqdm import tqdm
import pandas as pd
import html5lib
from xml.etree import ElementTree

1. We should collect all neighbourhoods of Moscow, Russia. At this tep we can get all postal codesfrom http://mosopen.ru/streets/post_codes_list. But we must pre-process data to have unique postal codes of regions.

Download **Moscow** postal codes data from site

In [3]:
moscow_postcodes_link='http://mosopen.ru/streets/post_codes_list'
raw_moscow_postcodes_page=requests.get(moscow_postcodes_link)
moscow_postcodes_page=raw_moscow_postcodes_page.text

Find start and end of HTML table inside page

In [4]:
page_table_start = moscow_postcodes_page.find('<table class="table_list">')
page_table_end = moscow_postcodes_page.find('</table>', page_table_start)
print(page_table_start, page_table_end)

17716 66404


Parse HTML table from page to variable

In [5]:
moscow_postcodes_page_table = moscow_postcodes_page[page_table_start : page_table_end]

Read HTML table to pandas DataFrame, rename columns and transform data type from _int_ to _str_

In [6]:
moscow_postcodes_df = pd.read_html(moscow_postcodes_page_table, header =0)[0]
# Rename columns
moscow_postcodes_df.columns = ['city_code', 'postal_code']
# Transform city_code to string
moscow_postcodes_df['city_code']= moscow_postcodes_df['city_code'].astype(str)

Split joined postcodes in column to rows

In [7]:
postcodes = moscow_postcodes_df['postal_code'].str.split(',', expand=True).stack().str.strip().reset_index(level=1, drop=True)
postcodes = postcodes.rename('postal_code')

Drop old column with joined postcodes

In [8]:
moscow_postcodes_df = moscow_postcodes_df.drop(['postal_code'], axis=1)

Merge city codes with postal codes

In [9]:
moscow_postcodes_df = pd.concat([moscow_postcodes_df, postcodes], axis=1)

Join city code with postal code into one column

In [10]:
moscow_postcodes_df['postal_code'] = moscow_postcodes_df['city_code'] + moscow_postcodes_df['postal_code']

Drop unused city code

In [11]:
moscow_postcodes_df = moscow_postcodes_df.drop(['city_code'], axis=1)

Convert **DataFrame** to **Series** and print result post codes list

In [12]:
moscow_postcodes = moscow_postcodes_df['postal_code']
moscow_postcodes.head()

0    101000
1    103070
1    103132
1    103274
2    105005
Name: postal_code, dtype: object

2. After that we have list of all Moscow postal codes and can get them coordinates from Openstreetmap with Nominatim API. URL must view like https://nominatim.openstreetmap.org/search?format=xml&city=Moscow&postalcode=117623

Create functiun to get **latitude** and **longitude** from **postal** code in Moscow
<br>URL configured to get information in _JSON_ format, only from _Moscow_.
<br>URL request return list of coordinates. Find only request on post code.

In [13]:
def get_postal_code_location(postal_code):
    # Inti null, if request in empty
    latitude = ''
    longitude = ''
    # generate link for postal code
    link_template='https://nominatim.openstreetmap.org/search?format=json&city=Moscow&postalcode={}'.format(postal_code)
    # get link request in JSON
    raw_json=requests.get(link_template).json()
    
    # Itterate all elements in JSON
    for element in raw_json:
        # Get only postcodes information instead city
        if element['type'] == 'postcode':
            # extract latitude and longitude
            latitude = element['lat']
            longitude = element['lon']
    
    return latitude, longitude    

Test function on different postal codes

In [14]:
print('129110', get_postal_code_location('129110'))
print('117209', get_postal_code_location('117209'))

129110 ('55.7852052906741', '37.6345827572715')
117209 ('55.6624249494477', '37.5769160148383')


Apply function to all postal codes and insert informtion into DataFrame

In [15]:
# Init DataFrame
postcodes_loc_df = pd.DataFrame(columns=['postal_code', 'latitude', 'longitude'])

for postal_code in tqdm(moscow_postcodes):
    # Get latitude and longitude from function
    latitude, longitude = get_postal_code_location(postal_code)
    # Add data to DataFrame
    postcodes_loc_df = postcodes_loc_df.append({'postal_code' : postal_code , 'latitude' : latitude, 'longitude': longitude} , ignore_index=True)

print('Done')

100%|██████████| 468/468 [04:07<00:00,  1.88it/s]

Done





Drop empty rows

In [16]:
postcodes_loc_df = postcodes_loc_df[postcodes_loc_df['latitude'] != '']

Print head

In [17]:
postcodes_loc_df.head()

Unnamed: 0,postal_code,latitude,longitude
0,101000,55.7608778083614,37.6342604732896
3,103274,55.7528410152297,37.574510795379
4,105005,55.7676310026104,37.6797064802202
5,105037,55.7938879994846,37.7736807317296
6,105043,55.7921423068318,37.7903780983711


3. After collecting all coordinates of postal codes in Moscow to pandas DataFrame we can get infromation about most populat venues from Foursquare.

Define Foursquare Credentials and Version¶

In [18]:
CLIENT_ID = 'Y3FU1X0JU1Z2YKUNLTUCCWH5K0DMDA0AZQAWWMFM4I3XIOZI' # your Foursquare ID
CLIENT_SECRET = '0G02XW3JUA4XIBNNBB5N1G4QUXC1MY3QZGT21DGJITJ2YYY3' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)
print('LIMIT = ', LIMIT)
print('radius = ', radius)

Your credentails:
CLIENT_ID: Y3FU1X0JU1Z2YKUNLTUCCWH5K0DMDA0AZQAWWMFM4I3XIOZI
CLIENT_SECRET:0G02XW3JUA4XIBNNBB5N1G4QUXC1MY3QZGT21DGJITJ2YYY3
LIMIT =  100
radius =  500


Let's create a function to repeat the same process to all the neighborhoods in Manhattan

In [25]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in tqdm(zip(names, latitudes, longitudes), total=len(names)):
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
        
        print(url)
        print(requests.get(url).json()["response"])
        
        
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

#### Now write the code to run the above function on each neighborhood and create a new dataframe called *manhattan_venues*.

In [26]:
moscow_venues = getNearbyVenues(postcodes_loc_df['postal_code'], postcodes_loc_df['latitude'], postcodes_loc_df['longitude'])

  0%|          | 0/446 [00:00<?, ?it/s]

https://api.foursquare.com/v2/venues/explore?&client_id=Y3FU1X0JU1Z2YKUNLTUCCWH5K0DMDA0AZQAWWMFM4I3XIOZI&client_secret=0G02XW3JUA4XIBNNBB5N1G4QUXC1MY3QZGT21DGJITJ2YYY3&v=20180605&ll=55.7608778083614,37.6342604732896&radius=500&limit=100
{}





KeyError: 'groups'

In [None]:
print(moscow_venues.shape)
moscow_venues.head()

In [None]:
moscow_venues.groupby('Neighborhood').count()

#### Let's find out how many unique categories can be curated from all the returned venues

In [None]:
print('There are {} uniques categories.'.format(len(moscow_venues['Venue Category'].unique())))

## 3. Analyze Each Neighborhood

In [None]:
# one hot encoding
moscow_onehot = pd.get_dummies(moscow_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
moscow_onehot['Neighborhood'] = moscow_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [moscow_onehot.columns[-1]] + list(moscow_onehot.columns[:-1])
moscow_onehot = moscow_onehot[fixed_columns]

moscow_onehot.head()

And let's examine the new dataframe size.

In [None]:
moscow_onehot.shape

#### Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [None]:
moscow_grouped = moscow_onehot.groupby('Neighborhood').mean().reset_index()
moscow_grouped

#### Let's confirm the new size

In [None]:
toronto_grouped.shape

#### Let's print each neighborhood along with the top 5 most common venues

In [None]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

#### Let's put that into a *pandas* dataframe
First, let's write a function to sort the venues in descending order.

In [None]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [None]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

## 4. Cluster Neighborhoods

Run *k*-means to cluster the neighborhood into 5 clusters.

In [None]:
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
len(kmeans.labels_)

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [None]:
toronto_grouped['Cluster Labels'] = kmeans.labels_
toronto_grouped

Merge clustering group to neighbourhood

In [None]:
toronto_merged = toronto_data.merge(toronto_grouped, left_on='Neighbourhood', right_on='Neighborhood', how='left')
toronto_merged = toronto_merged.dropna()
toronto_merged.shape

Convert **Cluster Labels** to *integer*

In [None]:
toronto_merged['Cluster Labels'] = toronto_merged['Cluster Labels'].astype('int')

In [None]:
# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

#### Use geopy library to get the latitude and longitude values of New York City.

In [None]:
address = 'Toronto, CAN'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of {} are {}, {}.'.format(address, latitude, longitude))

Finally, let's visualize the resulting clusters

In [None]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters