# 1. Getting the clean data

### First, we scrape the Wiki page with help of Pandas

In [2]:
import pandas as pd
df = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')[0]
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


### Let's drop rows where Borough is not assigned

In [3]:
df = df[df.Borough != 'Not assigned']
df = df.reset_index(drop=True)
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


### Let's check whether we have 'Not assigned' neighbourhoods

In [4]:
NA_Neighbourhoods = df['Neighbourhood'].str.contains('Not assigned').any()
print(NA_Neighbourhoods)

False


### Let's count the number of rows of the dataframe

In [5]:
df.shape

(103, 3)

# 2. Adding geospatial data

### Unable to import Geocoder package I decided to download the csv file

In [6]:
!wget -q -O 'Geospatial_Data.csv' https://cocl.us/Geospatial_data
print('Data downloaded!')

Data downloaded!


### Getting the dataframe out of the csv

In [7]:
df_lon_lat = pd.read_csv('Geospatial_Data.csv')
df_lon_lat.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Now let's merge two dataframes

In [8]:
df_lon_lat.columns=['Postal Code','Latitude','Longitude']
T_df = pd.merge(df,
                 df_lon_lat[['Postal Code','Latitude', 'Longitude']],
                 on='Postal Code')
T_df

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


# 3. Clustering the neighborhoods in Toronto

### Let's count boroughs and neighborhoods in our dataframe

In [9]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(T_df['Borough'].unique()),
        T_df.shape[0]
    )
)

The dataframe has 10 boroughs and 103 neighborhoods.


### Let's leave only boroughs that contain the word Toronto now

In [10]:
toronto_data = T_df[T_df['Borough'].str.contains("Toronto")].reset_index(drop=True)
toronto_data.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


### Let's count boroughs and neighborhoods in the new dataframe

In [11]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(toronto_data['Borough'].unique()),
        toronto_data.shape[0]
    )
)

The dataframe has 4 boroughs and 39 neighborhoods.


### Importing Nominatim to convert an address into latitude and longitude values

In [12]:
!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim

Collecting package metadata (current_repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Solving environment: failed with repodata from current_repodata.json, will retry with next repodata source.
Collecting package metadata (repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python-3.7-main

  added / updated specs:
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    _libgcc_mutex-0.1          |      conda_forge           3 KB  conda-forge
    _openmp_mutex-4.5          |           1_llvm           5 KB  conda-forge
    _py-xgboost-mutex-2.0      |            cpu_0           8 KB  conda-forge
    _pytorch_select-0.2        |            gpu_0           2 KB
    absl-py-0.11.0          

libcurl-7.71.1       | 312 KB    | ##################################### | 100% 
pandocfilters-1.4.2  | 9 KB      | ##################################### | 100% 
notebook-6.2.0       | 6.2 MB    | ##################################### | 100% 
wheel-0.36.2         | 31 KB     | ##################################### | 100% 
webencodings-0.5.1   | 12 KB     | ##################################### | 100% 
libevent-2.1.10      | 1.1 MB    | ##################################### | 100% 
decorator-4.4.2      | 11 KB     | ##################################### | 100% 
tqdm-4.57.0          | 76 KB     | ##################################### | 100% 
rsa-4.7.1            | 28 KB     | ##################################### | 100% 
beautifulsoup4-4.9.3 | 86 KB     | ##################################### | 100% 
geopy-2.1.0          | 64 KB     | ##################################### | 100% 
argon2-cffi-20.1.0   | 47 KB     | ##################################### | 100% 
ld_impl_linux-64-2.3 | 618 K

future-0.18.2        | 714 KB    | ##################################### | 100% 
patsy-0.5.1          | 187 KB    | ##################################### | 100% 
pyshp-2.1.0          | 31 KB     | ##################################### | 100% 
backports.functools_ | 8 KB      | ##################################### | 100% 
botocore-1.20.13     | 4.5 MB    | ##################################### | 100% 
typing_extensions-3. | 25 KB     | ##################################### | 100% 
wcwidth-0.2.5        | 33 KB     | ##################################### | 100% 
scikit-image-0.18.1  | 11.5 MB   | ##################################### | 100% 
pyqtchart-5.12       | 256 KB    | ##################################### | 100% 
jupyterlab_widgets-1 | 130 KB    | ##################################### | 100% 
libgcc-ng-9.3.0      | 7.8 MB    | ##################################### | 100% 
ibm-wsrt-py37main-ke | 2 KB      | ##################################### | 100% 
aiohttp-3.7.3        | 629 K

jinja2-2.11.3        | 93 KB     | ##################################### | 100% 
pcre-8.44            | 261 KB    | ##################################### | 100% 
cffi-1.14.5          | 225 KB    | ##################################### | 100% 
mock-4.0.3           | 51 KB     | ##################################### | 100% 
boto3-1.17.13        | 70 KB     | ##################################### | 100% 
requests-2.25.1      | 51 KB     | ##################################### | 100% 
liblapack-3.9.0      | 11 KB     | ##################################### | 100% 
freetype-2.10.4      | 890 KB    | ##################################### | 100% 
keras-preprocessing- | 34 KB     | ##################################### | 100% 
libdeflate-1.7       | 67 KB     | ##################################### | 100% 
mpc-1.1.0            | 105 KB    | ##################################### | 100% 
libxcb-1.13          | 395 KB    | ##################################### | 100% 
google-auth-oauthlib | 19 KB

done


### Let's get the geographical coordinates of Manhattan.

In [15]:
address = 'Toronto City, Toronto'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto City are 43.65238435, -79.38356765.


### Installing Folium

In [17]:
!conda install -c conda-forge folium=0.5.0 --yes
import folium

Collecting package metadata (current_repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Collecting package metadata (repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python-3.7-main

  added / updated specs:
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    altair-4.1.0               |             py_1         614 KB  conda-forge
    branca-0.4.2               |     pyhd8ed1ab_0          26 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         713 KB

The following NEW packages will be INSTALLED:

  altair             co

### Creating the map of Toronto City

In [24]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

for lat, lng, borough, neighborhood in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Borough'], toronto_data['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

### Defining Foursquare Credentials and Version

In [14]:
CLIENT_ID = 'HVVJ1P2ABN2JDRBNHJ5PMAIVOEFJTRLSUZAUCWBDQDSQ3N0B'
CLIENT_SECRET = 'M0CP4OUBV4RRSQXA0ZT2ZDLBZKNBAN0G15HWXD1FTKTF2UKI'
VERSION = '20210222'
LIMIT = 100

print('My credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

My credentails:
CLIENT_ID: HVVJ1P2ABN2JDRBNHJ5PMAIVOEFJTRLSUZAUCWBDQDSQ3N0B
CLIENT_SECRET:M0CP4OUBV4RRSQXA0ZT2ZDLBZKNBAN0G15HWXD1FTKTF2UKI


### Let's start exploring the first neighborhoods in the dataframe and get their names first

In [26]:
toronto_data.loc[0, 'Neighbourhood']

'Regent Park, Harbourfront'

### Getting their latitude and longitude values

In [28]:
neighbourhoods_latitude = toronto_data.loc[0, 'Latitude']
neighbourhoods_longitude = toronto_data.loc[0, 'Longitude']

neighbourhoods_names = toronto_data.loc[0, 'Neighbourhood']

print('Latitude and longitude values of {} are {}, {}.'.format(neighbourhoods_names, 
                                                               neighbourhoods_latitude, 
                                                               neighbourhoods_longitude))

Latitude and longitude values of Regent Park, Harbourfront are 43.6542599, -79.3606359.


### Getting an url for the request to find top 100 venues in the first neighbourhoods

In [29]:
radius = 500 # define radius

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighbourhoods_latitude, 
    neighbourhoods_longitude, 
    radius, 
    LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=HVVJ1P2ABN2JDRBNHJ5PMAIVOEFJTRLSUZAUCWBDQDSQ3N0B&client_secret=M0CP4OUBV4RRSQXA0ZT2ZDLBZKNBAN0G15HWXD1FTKTF2UKI&v=20210222&ll=43.6542599,-79.3606359&radius=500&limit=100'

### Importing libraries to send requests and handle JSON files

In [32]:
import requests
import json

### Sending GET reguest and saving data

In [33]:
results = requests.get(url).json()

### Creating function that extracts the category of the venue

In [55]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

### Cleaning the json and structuring it into a pandas dataframe

In [62]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = pd.json_normalize(venues)

filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns]

nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

AttributeError: 'Series' object has no attribute '_mgr'