In [1]:
# Install the notebook's dependencies with the same interpreter that runs this kernel
# (sys.executable), so the packages land in the kernel's own environment.
import sys
!{sys.executable} -m pip install pandas requests lxml html5lib tqdm



Import packages

In [2]:
import os
import re
from io import StringIO
from xml.etree import ElementTree

import html5lib
import numpy as np
import pandas as pd
import requests
from tqdm import tqdm

1. First, we collect all neighbourhoods of Moscow, Russia. At this step we can get all postal codes from http://mosopen.ru/streets/post_codes_list, but the data must be pre-processed to obtain the unique postal codes of each region.

Download **Moscow** postal codes data from site

In [3]:
# Page that lists the Moscow postal codes
moscow_postcodes_link='http://mosopen.ru/streets/post_codes_list'
# A timeout keeps the cell from hanging forever if the site does not answer
raw_moscow_postcodes_page=requests.get(moscow_postcodes_link, timeout=30)
# Fail loudly on a 4xx/5xx answer instead of handing an error page to the parsing cells
raw_moscow_postcodes_page.raise_for_status()
moscow_postcodes_page=raw_moscow_postcodes_page.text

Find start and end of HTML table inside page

In [4]:
# Offset of the opening tag of the postal-code table inside the downloaded page
page_table_start = moscow_postcodes_page.find('<table class="table_list">')
# str.find returns -1 when the marker is missing, which would silently produce a
# meaningless slice later on -- stop here with a clear message instead
if page_table_start == -1:
    raise ValueError('Postal code table (<table class="table_list">) not found in the page')

# Offset of the first closing tag that follows the opening tag
page_table_end = moscow_postcodes_page.find('</table>', page_table_start)
if page_table_end == -1:
    raise ValueError('Closing </table> tag of the postal code table not found in the page')

print(page_table_start, page_table_end)

17716 66404


Extract the HTML table from the page into a variable

In [5]:
# page_table_end points at the START of '</table>', so extend the slice by the tag's
# length to keep the closing tag and hand the parser a complete <table> element
moscow_postcodes_page_table = moscow_postcodes_page[page_table_start : page_table_end + len('</table>')]

Read HTML table to pandas DataFrame, rename columns and transform data type from _int_ to _str_

In [6]:
# pandas deprecated passing literal HTML text to read_html; wrap it in a file-like object.
# header=0 uses the first table row as the column header.
moscow_postcodes_df = pd.read_html(StringIO(moscow_postcodes_page_table), header=0)[0]
# Rename columns
moscow_postcodes_df.columns = ['city_code', 'postal_code']
# Transform city_code to string so it can later be concatenated with the postal code text
moscow_postcodes_df['city_code'] = moscow_postcodes_df['city_code'].astype(str)

Split joined postcodes in column to rows

In [7]:
# Turn each comma-separated cell into one row per code. The original row label is kept
# as the index (the per-row position level added by stack() is dropped), so the codes
# can be matched back to their city_code row.
postcodes = (
    moscow_postcodes_df['postal_code']
    .str.split(',', expand=True)
    .stack()
    .str.strip()
    .reset_index(level=1, drop=True)
    .rename('postal_code')
)

Drop old column with joined postcodes

In [8]:
# The comma-joined column is superseded by the exploded `postcodes` Series
moscow_postcodes_df = moscow_postcodes_df.drop(columns='postal_code')

Merge city codes with postal codes

In [9]:
# Attach every exploded postal code to its city_code row by matching index labels;
# a row label that occurs several times in `postcodes` yields several output rows
moscow_postcodes_df = moscow_postcodes_df.join(postcodes)

Join city code with postal code into one column

In [10]:
# Prefix every postal code fragment with its city code (plain string concatenation)
moscow_postcodes_df['postal_code'] = moscow_postcodes_df['city_code'].add(
    moscow_postcodes_df['postal_code']
)

Drop unused city code

In [11]:
# city_code is now part of postal_code, so the separate column is no longer needed
moscow_postcodes_df = moscow_postcodes_df.drop(columns=['city_code'])

Convert **DataFrame** to **Series** and print result post codes list

In [12]:
# Keep only the postal_code column as a Series
moscow_postcodes = moscow_postcodes_df.loc[:, 'postal_code']
# Preview the first five codes
moscow_postcodes.head(n=5)

0    101000
1    103070
1    103132
1    103274
2    105005
Name: postal_code, dtype: object

2. Now that we have a list of all Moscow postal codes, we can get their coordinates from OpenStreetMap with the Nominatim API. The URL must look like https://nominatim.openstreetmap.org/search?format=json&city=Moscow&postalcode=117623

Create a function to get the **latitude** and **longitude** of a **postal** code in Moscow
<br>The URL is configured to return information in _JSON_ format, only for _Moscow_.
<br>The request returns a list of results; only the entries of type postcode are used.

In [13]:
def get_postal_code_location(postal_code):
    """Look up the coordinates of a Moscow postal code with the Nominatim search API.

    Parameters
    ----------
    postal_code : str
        Postal code to search for, e.g. ``'129110'``.

    Returns
    -------
    tuple
        ``(latitude, longitude)`` taken unchanged from the ``'lat'`` / ``'lon'``
        fields of the response. Both are ``''`` when the response contains no
        entry whose ``'type'`` is ``'postcode'``. If several entries match, the
        last one in the response is returned.

    Raises
    ------
    requests.RequestException
        If the request times out or the server answers with a 4xx/5xx status.
    """
    # Empty strings mark "not found"
    latitude = ''
    longitude = ''

    # Let requests build and URL-encode the query string instead of formatting it by hand;
    # the timeout prevents a single stalled request from blocking the caller forever
    response = requests.get(
        'https://nominatim.openstreetmap.org/search',
        params={'format': 'json', 'city': 'Moscow', 'postalcode': postal_code},
        timeout=30,
    )
    response.raise_for_status()

    # Iterate over all elements of the JSON result list
    for element in response.json():
        # Keep only postcode entries (not the city itself); .get() tolerates
        # entries that carry no 'type' field at all
        if element.get('type') == 'postcode':
            latitude = element['lat']
            longitude = element['lon']

    return latitude, longitude

Test function on different postal codes

In [14]:
# Smoke-test the lookup on two known postal codes
for sample_code in ('129110', '117209'):
    print(sample_code, get_postal_code_location(sample_code))

129110 ('55.7852052906741', '37.6345827572714')
117209 ('55.6624249494477', '37.5769160148382')


Apply the function to all postal codes and insert the information into a DataFrame

In [15]:
# Init DataFrame
# Collect one record per postal code and build the DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on every row.
location_records = []

try:
    for postal_code in tqdm(moscow_postcodes):
        # Get latitude and longitude from function
        latitude, longitude = get_postal_code_location(postal_code)
        location_records.append({'postal_code': postal_code, 'latitude': latitude, 'longitude': longitude})
finally:
    # Built in `finally` so that the rows gathered so far are still available
    # if the loop is interrupted or a request fails part-way through
    postcodes_loc_df = pd.DataFrame(location_records, columns=['postal_code', 'latitude', 'longitude'])

print('Done')

 65%|██████▌   | 305/468 [02:34<01:23,  1.95it/s]

KeyboardInterrupt: 

Drop empty rows

In [None]:
# Keep only the postal codes for which a latitude was found ('' marks "not found")
postcodes_loc_df = postcodes_loc_df.loc[postcodes_loc_df['latitude'].ne('')]

Print head

In [None]:
# Preview the first rows of the postal code / coordinate table
postcodes_loc_df.head()

3. After collecting the coordinates of all Moscow postal codes into a pandas DataFrame, we can get information about the most popular venues from Foursquare.

Define Foursquare Credentials and Version

In [None]:
# Credentials are read from the environment so that they are never stored in the notebook.
# Export FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE(review): a key pair was previously committed in this cell -- it should be revoked.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius

if not (CLIENT_ID and CLIENT_SECRET):
    print('WARNING: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; Foursquare requests will fail')

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
# Never echo the secret itself: cell outputs are saved and shared with the notebook
print('CLIENT_SECRET:' + ('<hidden>' if CLIENT_SECRET else '<not set>'))
print('LIMIT = ', LIMIT)
print('radius = ', radius)

Let's create a function to repeat the same process for all the postal-code neighborhoods in Moscow

In [None]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Collect the venues Foursquare reports around each given location.

    For every ``(name, latitude, longitude)`` triple one request is sent to the
    Foursquare ``venues/explore`` endpoint, using the module-level ``CLIENT_ID``,
    ``CLIENT_SECRET``, ``VERSION`` and ``LIMIT`` settings.

    Parameters
    ----------
    names : sized iterable
        Label of each location (``len(names)`` is used for the progress bar).
    latitudes, longitudes : iterable
        Coordinates of each location, in the same order as ``names``.
    radius : int, optional
        Value sent as the ``radius`` query parameter. Defaults to 500.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns ``Neighborhood``,
        ``Neighborhood Latitude``, ``Neighborhood Longitude``, ``Venue``,
        ``Venue Latitude``, ``Venue Longitude`` and ``Venue Category``.
        The frame is empty (but keeps these columns) when no venue is returned;
        ``Venue Category`` is missing for venues that list no category.

    Raises
    ------
    requests.RequestException
        If a request times out or the API answers with a 4xx/5xx status.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    venue_rows = []
    for name, lat, lng in tqdm(zip(names, latitudes, longitudes), total=len(names)):
        # make the GET request; requests builds and URL-encodes the query string,
        # and the timeout keeps one stalled request from blocking the whole run
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,
        )
        # surface API errors here rather than as an obscure KeyError on the payload
        response.raise_for_status()

        groups = response.json().get('response', {}).get('groups', [])
        results = groups[0]['items'] if groups else []

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            # some venues carry an empty category list
            categories = venue.get('categories') or []
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # passing the column names up front keeps the frame well-formed even with zero rows
    nearby_venues = pd.DataFrame(venue_rows, columns=columns)

    return(nearby_venues)

#### Now write the code to run the above function on each neighborhood and create a new dataframe called *moscow_venues*.

In [None]:
# Query the venues around every geocoded postal code; the postal code is the neighborhood label
moscow_venues = getNearbyVenues(
    names=postcodes_loc_df['postal_code'],
    latitudes=postcodes_loc_df['latitude'],
    longitudes=postcodes_loc_df['longitude'],
)

In [None]:
# Report the (rows, columns) size of the venue table, then preview its first rows
print(moscow_venues.shape)
moscow_venues.head()

In [None]:
# Count the non-null entries of every column for each neighborhood
moscow_venues.groupby('Neighborhood').count()

#### Let's find out how many unique categories can be curated from all the returned venues

In [None]:
# Number of distinct venue categories; dropna=False counts a missing category as its
# own value, exactly like len(Series.unique()) does
print('There are {} uniques categories.'.format(
    moscow_venues['Venue Category'].nunique(dropna=False)
))

## 4. Analyze Each Neighborhood

In [None]:
# one hot encoding
moscow_onehot = pd.get_dummies(moscow_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
moscow_onehot['Neighborhood'] = moscow_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [moscow_onehot.columns[-1]] + list(moscow_onehot.columns[:-1])
moscow_onehot = moscow_onehot[fixed_columns]

moscow_onehot.head()

And let's examine the new dataframe size.

In [None]:
# (rows, columns) of the one-hot encoded venue table
moscow_onehot.shape

#### Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [None]:
# Average every indicator column per neighborhood, i.e. the share of each category among
# the neighborhood's venues; as_index=False keeps 'Neighborhood' as a regular column
moscow_grouped = moscow_onehot.groupby('Neighborhood', as_index=False).mean()
moscow_grouped