# Toronto housing prices and the venues around them

In [None]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

# !conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

# !conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

from bs4 import BeautifulSoup # to scrape the housing file
from selenium import webdriver

import urllib

print('Libraries imported.')

# Data Preparation

Collecting the house-price data for each Toronto community

In [12]:
# Download the community price page and parse it into a nested dictionary:
#     housing[neighborhood][property type][bedroom label] -> price string
# Prices are kept as digit strings ('$' and ',' are stripped below).
response = requests.get(
    'https://toronto.listing.ca/real-estate-prices-by-community.htm',
    timeout=30,  # do not hang the kernel forever if the site is unresponsive
)
response.raise_for_status()  # stop on an HTTP error instead of parsing an error page
source = response.text
soup = BeautifulSoup(source, 'lxml')
toronto = soup.find('div', class_='mt30')
if toronto is None:
    raise ValueError("Could not find the 'mt30' container; the page layout may have changed.")

housing = {}

for neighborhood in toronto.find_all('div', class_='nd m5'):
    neighborhood_name = neighborhood.div.text
    housing[neighborhood_name] = {}

    # Flatten the text of every matched element into tokens: '$' becomes a
    # separator and thousands separators are removed before splitting on newlines.
    # NOTE(review): find_all(['rc', 'r1']) matches elements by tag name; confirm
    # against the page markup that these are tags rather than CSS classes.
    housing_list = []
    for properties in neighborhood.find_all(['rc', 'r1']):
        housing_list += properties.text.replace('$', '\n').replace(',', '').split('\n')

    # Reset per neighborhood so a bedroom token can never be attached to a
    # property type left over from the previous neighborhood.
    property_type = None
    for i, token in enumerate(housing_list):
        if 'Condo' in token or 'Townhome' in token or 'Home' in token:
            property_type = token
            housing[neighborhood_name][property_type] = {}
        elif 'Studio' in token or 'Bedroom' in token:
            # The price is the token right after the bedroom label; skip the label
            # when no property type has been seen yet or no token follows it.
            if property_type is not None and i + 1 < len(housing_list):
                housing[neighborhood_name][property_type][token] = housing_list[i + 1]

# One entry per (neighborhood, property type) pair, derived from the dictionary
# itself so it always lines up with a nested iteration over `housing`.
neighborhood_name_list = [
    name for name, property_types in housing.items() for _ in property_types
]

Making a list for each column from the dictionary 

In [13]:
# Turn the nested `housing` dictionary into parallel lists, one entry per
# (neighborhood, house type) pair:
#   rows_house            -> house type label
#   rows_bed1/2/3         -> integer price for 1/2/3 bedrooms, or None when absent
#   rows_bed              -> flat [label, price, label, price, ...] list per pair
rows_house = []
rows_bed = []
rows_bed1 = []
rows_bed2 = []
rows_bed3 = []
bedroom1 = '1 Bedroom'
bedroom2 = '2 Bedrooms'
bedroom3 = '3 Bedrooms'
tracked_beds = (bedroom1, bedroom2, bedroom3)
rows_by_bed = {bedroom1: rows_bed1, bedroom2: rows_bed2, bedroom3: rows_bed3}

for data in housing:
    for house_type, bed_prices in housing[data].items():
        rows_house.append(house_type)

        # Only the three tracked bedroom counts are kept; prices arrive as digit strings.
        prices = {
            bed_type: int(price)
            for bed_type, price in bed_prices.items()
            if bed_type in tracked_beds
        }

        # rows_bed keeps the found pairs first (in source order), followed by a
        # (label, None) pair for every tracked bedroom count that was missing.
        new_bed = []
        for bed_type, price in prices.items():
            new_bed += [bed_type, price]
        for bed_type in tracked_beds:
            if bed_type not in prices:
                new_bed += [bed_type, None]
        rows_bed.append(new_bed)

        # Exactly one value per tracked bedroom count goes to its column list.
        for bed_type in tracked_beds:
            rows_by_bed[bed_type].append(prices.get(bed_type))

Creating a Dataframe

In [14]:
# Assemble the price table from the parallel lists built above and drop every
# row that lacks a 2- or 3-bedroom price. The '1 Bedroom' column (rows_bed1)
# is not included.
houses = pd.DataFrame(
    {
        'Neighborhood': neighborhood_name_list,
        'House Type': rows_house,
        '2 Bedrooms': rows_bed2,
        '3 Bedrooms': rows_bed3,
    }
).dropna()

# Neighborhood names as they appear on the listing site, in scrape order.
areas_list = list(housing)
houses.head()

Unnamed: 0,Neighborhood,House Type,2 Bedrooms,3 Bedrooms
0,Agincourt North,Condos,433756.0,455588.0
1,Agincourt North,Condo Townhomes,415745.0,612993.0
2,Agincourt North,Freehold Townhomes,674552.0,759873.0
4,Agincourt South-Malvern West,Condos,474412.0,425634.0
5,Agincourt South-Malvern West,Condo Townhomes,501112.0,685167.0


Importing the location data and creating the dataframe

In [15]:
# Load the neighbourhood file and keep only name, area code and coordinates.
toronto_neighborhood = pd.read_csv('Neighbourhoods.csv')

column_names = ['Neighborhood', 'Area Code', 'Latitude', 'Longitude']

# Strip trailing digits/parentheses from AREA_NAME, then any trailing whitespace
# that is left behind.
area_names = (
    toronto_neighborhood['AREA_NAME']
    .str.rstrip('(1234567890)')
    .str.rstrip()
)

location = pd.DataFrame(
    {
        'Neighborhood': area_names,
        'Area Code': toronto_neighborhood['AREA_SHORT_CODE'],
        'Latitude': toronto_neighborhood['LATITUDE'],
        'Longitude': toronto_neighborhood['LONGITUDE'],
    },
    columns=column_names,
).sort_values(by=['Neighborhood'])

print(location.shape)
location.head()

(140, 4)


Unnamed: 0,Neighborhood,Area Code,Latitude,Longitude
74,Agincourt North,129,43.805441,-79.266712
75,Agincourt South-Malvern West,128,43.788658,-79.265612
76,Alderwood,20,43.604937,-79.541611
77,Annex,95,43.671585,-79.404001
78,Banbury-Don Mills,42,43.737657,-79.349718


### Cleaning the houses dataframe so that the Neighborhood columns of the houses and location dataframes contain the same neighborhoods

In [16]:
# Put the official neighbourhood names next to the scraped names so the two
# spellings can be compared row by row.
neighborhood_col = location['Neighborhood'].tolist()

# The official list is padded with three placeholder rows ('a', 'b', 'c')
# before the scraped names are attached.
# NOTE(review): presumably the scraped list is three entries longer; the
# column assignment below fails if the lengths differ.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
equality = pd.concat(
    [location[['Neighborhood']], pd.DataFrame({'Neighborhood': ['a', 'b', 'c']})],
    ignore_index=True,
)
equality['Neighborhood_house'] = areas_list
equality = equality.reset_index(drop=True)
equality.head()

Unnamed: 0,Neighborhood,Neighborhood_house
0,Agincourt North,Agincourt North
1,Agincourt South-Malvern West,Agincourt South-Malvern West
2,Alderwood,Alderwood
3,Annex,Annex
4,Banbury-Don Mills,Banbury-Don Mills


Finding the values not in both columns, and the neighborhoods with similar names

In [17]:
# Walk the scraped names (areas_list) and the official names (neighborhood_col)
# in parallel on working copies. When the names at position i differ, look up to
# 10 positions ahead in each list; an entry at position i is deleted from one
# list when the other list's name at i is found ahead in it. Positions that
# differ without any deletion are recorded in `missing`.
#
# Results:
#   deleted_areas_list        -> names removed from the scraped list
#   deleted_neighborhood_col  -> names removed from the official list
#   missing                   -> flat list alternating [scraped name, official name, ...]
#
# NOTE(review): 139 and 133 are hard-coded; presumably they derive from the list
# lengths at the time of writing -- confirm before reusing with new data.
length = 139  # decremented on every deletion; not read anywhere in this cell
missing = []
deleted_areas_list = []
deleted_neighborhood_col = []
new_areas_list = areas_list.copy()
new_neighborhood_col = neighborhood_col.copy()
for i in range(133):
    deleted = 0  # set to 1 as soon as anything is deleted for this position
    if new_areas_list[i] != new_neighborhood_col[i]:
        # The `break` statements are commented out, so the look-ahead keeps
        # running after a deletion and compares against the already shifted lists.
        for j in range(10):
            # Is the official name at i found at offset j in the scraped list?
            # If so, the scraped entry at i is removed.
            try:
                if new_neighborhood_col[i] == new_areas_list[i+j]:
                    print ('Deleted from areas list at', new_areas_list[i], i)
                    deleted_areas_list += [new_areas_list[i]]
                    del new_areas_list[i]
                    length -= 1
                    deleted = 1
                    # break
            # Bare except: any exception (presumably an IndexError when i+j runs
            # past the end of the list) is also handled by deleting the entry.
            except:
                print ('Deleted from areas list at', new_areas_list[i], i)
                deleted_areas_list += [new_areas_list[i]]
                del new_areas_list[i]
                length -= 1
                deleted = 1

            # Mirror check: is the scraped name at i found at offset j in the
            # official list? If so, the official entry at i is removed.
            try:
                if new_neighborhood_col[i+j] == new_areas_list[i]:
                    print ('Deleted from neighborhood column at', new_neighborhood_col[i], i)
                    deleted_neighborhood_col += [new_neighborhood_col[i]]
                    del new_neighborhood_col[i]
                    length -= 1
                    deleted = 1
                    # break

            # Same bare-except fallback as above, applied to the official list.
            except:
                print ('Deleted from neighborhood column at', new_neighborhood_col[i], i)
                deleted_neighborhood_col += [new_neighborhood_col[i]]
                del new_neighborhood_col[i]
                length -= 1
                deleted = 1
                # break

        # Nothing was deleted: record the two differing names as a pair.
        if deleted != 1:
            missing += [new_areas_list[i], new_neighborhood_col[i]]
            print ('Conflicted at', i)

print (deleted_areas_list)
print (deleted_neighborhood_col)

Conflicted at 19
Deleted from areas list at Crescent Town 28
Conflicted at 29
Deleted from areas list at East York 36
Conflicted at 53
Deleted from areas list at Humberlea-Pelmo Park W4 55
Deleted from areas list at Humbermede 56
Conflicted at 70
Conflicted at 77
Conflicted at 89
Conflicted at 92
Deleted from neighborhood column at Old East York 93
Deleted from neighborhood column at Pelmo Park-Humberlea 95
Conflicted at 103
Deleted from areas list at Rouge E11 104
Conflicted at 109
Deleted from neighborhood column at Taylor-Massey 113
Conflicted at 114
Conflicted at 119
Deleted from areas list at Waterfront Communities C8 120
Conflicted at 125
['Crescent Town', 'East York', 'Humberlea-Pelmo Park W4', 'Humbermede', 'Rouge E11', 'Waterfront Communities C8']
['Old East York', 'Pelmo Park-Humberlea', 'Taylor-Massey']


Extracting the prices associated with each neighborhood and house type, and taking the mean of the two listings when both refer to the same neighborhood

In [18]:
# Some neighbourhoods are listed on the site as several sub-areas. For each
# name prefix, average the sub-areas' prices per house type.
#   replacing_areas   -> per prefix, the matching rows of `houses` as lists
#   consolidated_list -> flat list alternating [house types], [[mean 2-bed, mean 3-bed], ...]
#   replaced          -> per prefix, rows of [area name, house type, mean 2-bed, mean 3-bed]
start_with_list = ['Humberlea', 'Rouge', 'Waterfront Communities C']
drop_with_list = ['Humberlea-Pelmo Park W5', 'Humberlea-Pelmo Park W4', 'Rouge E10', 'Rouge E11', 'Waterfront Communities C1', 'Waterfront Communities C8']
consolidated_list = []
replacing_areas = []
replaced = []

for data in start_with_list:
    s = houses['Neighborhood'].str.startswith(data)
    temp_df = houses[s]
    replacing_areas += [temp_df.values.tolist()]
    # Average only the price columns: a bare .mean() would also try to average
    # the string 'Neighborhood' column, which raises on pandas >= 2.0.
    mean_prices = temp_df.groupby('House Type')[['2 Bedrooms', '3 Bedrooms']].mean()
    consolidated_list += [mean_prices.index.tolist(), mean_prices.values.tolist()]

print (consolidated_list)
for i in range(0, len(consolidated_list), 2):
    house_types = consolidated_list[i]
    mean_rows = consolidated_list[i + 1]
    area_rows = replacing_areas[i // 2]
    # The area name comes from the j-th matching row of `houses`; the house type
    # and the two mean prices come from the grouped result.
    replaced += [[
        [area_rows[j][0], house_types[j], mean_rows[j][0], mean_rows[j][1]]
        for j in range(len(house_types))
    ]]

[['Condo Townhomes', 'Condos', 'Detached Homes', 'Freehold Townhomes'], [[516813.0, 608613.0], [383564.0, 445335.0], [784021.5, 853258.5], [724073.0, 762889.0]], ['Condo Townhomes', 'Condos', 'Detached Homes', 'Freehold Townhomes'], [[453941.0, 574959.0], [440787.0, 494246.0], [995428.0, 820396.5], [592559.5, 738665.5]], ['Condo Townhomes', 'Condos', 'Freehold Townhomes'], [[1178989.5, 1664523.5], [880582.5, 1061512.5], [1299093.5, 1304075.0]]]


In [None]:
# for i in range(len(areas_list) - 3):
#     if areas_list[i] == 'Crescent Town':
#         areas_list[i] = 'Taylor-Massey'
#     if areas_list[i] == 'East York':
#         areas_list[i] = 'Old East York'
#     if areas_list[i] == 'Humberlea-Pelmo Park W5':
#         areas_list[i] = 'Pelmo Park-Humberlea'
#     if areas_list[i] == 'Humberlea-Pelmo Park W4':
#         del areas_list[i]
#     if areas_list[i] == 'Rouge E10':
#         areas_list[i] = 'Rouge'
#     if areas_list[i] == 'Rouge E11':
#         del areas_list[i]
#     if areas_list[i] == 'Waterfront Communities C1':
#         areas_list[i] = 'Waterfront Communities-The Island'
#     if areas_list[i] == 'Waterfront Communities C8':
#         del areas_list[i]
# # areas_list

Updating the replaced values in the houses dataframe

In [19]:
# Replace the sub-area rows (e.g. 'Rouge E10' / 'Rouge E11') with one row per
# house type that carries the official neighbourhood name and the mean prices
# computed in `replaced`, then align the remaining listing-site names with the
# official ones.
real_areas = ['Pelmo Park-Humberlea', 'Rouge', 'Waterfront Communities-The Island']
price_cols = ['2 Bedrooms', '3 Bedrooms']

consolidated_frames = []
for prefix, real_area, area_rows in zip(start_with_list, real_areas, replaced):
    # .copy() so the assignments below never write into a view of `houses`.
    temp_df = houses[houses['Neighborhood'].str.startswith(prefix)].copy()
    # Rename every matching row, not only the names that happen to appear in
    # the first few rows; this is why no special case for
    # 'Waterfront Communities C8' is needed.
    temp_df['Neighborhood'] = real_area
    for _, house_type, price_2bed, price_3bed in area_rows:
        temp_df.loc[temp_df['House Type'] == house_type, price_cols] = [price_2bed, price_3bed]
    # Sub-areas now share name, type and prices, so duplicates collapse to one row.
    consolidated_frames.append(temp_df.drop_duplicates())

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
houses = pd.concat([houses] + consolidated_frames)

# Remove the original sub-area rows now that their consolidated rows exist.
houses = houses[~houses['Neighborhood'].isin(drop_with_list)]

houses = houses.replace({'Crescent Town': 'Taylor-Massey', 'East York': 'Old East York'})

houses = houses.sort_values(by=['Neighborhood', 'House Type'])
houses = houses.reset_index(drop=True)

# `missing` alternates [listing name, official name, ...]. Keep only the pairs
# whose listing name is not one of the dropped sub-areas. The list is rebuilt
# instead of deleting while iterating, which could index from the wrong end.
kept_pairs = [
    (missing[k], missing[k + 1])
    for k in range(0, len(missing) - 1, 2)
    if missing[k] not in drop_with_list
]
missing[:] = [name for pair in kept_pairs for name in pair]

# Apply the renames one at a time, in order.
for listing_name, official_name in kept_pairs:
    houses = houses.replace({listing_name: official_name})
houses.head()

Unnamed: 0,Neighborhood,House Type,2 Bedrooms,3 Bedrooms
0,Agincourt North,Condo Townhomes,415745.0,612993.0
1,Agincourt North,Condos,433756.0,455588.0
2,Agincourt North,Freehold Townhomes,674552.0,759873.0
3,Agincourt South-Malvern West,Condo Townhomes,501112.0,685167.0
4,Agincourt South-Malvern West,Condos,474412.0,425634.0


Merging both the dataframes

In [20]:
# Attach area code and coordinates to every price row. The default inner join
# keeps only neighbourhoods present in both frames.
houses = houses.merge(location, on='Neighborhood')
print(houses.shape)
houses.head()

(454, 7)


Unnamed: 0,Neighborhood,House Type,2 Bedrooms,3 Bedrooms,Area Code,Latitude,Longitude
0,Agincourt North,Condo Townhomes,415745.0,612993.0,129,43.805441,-79.266712
1,Agincourt North,Condos,433756.0,455588.0,129,43.805441,-79.266712
2,Agincourt North,Freehold Townhomes,674552.0,759873.0,129,43.805441,-79.266712
3,Agincourt South-Malvern West,Condo Townhomes,501112.0,685167.0,128,43.788658,-79.265612
4,Agincourt South-Malvern West,Condos,474412.0,425634.0,128,43.788658,-79.265612


Getting the location of Toronto

In [21]:
# Look up the coordinates of Toronto; they are used to centre the maps below.
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="on_explorer")
# NOTE(review): this rebinds `location`, which until here held the
# neighbourhood-coordinates DataFrame; re-running the merge cell after this
# one will not work. The name is kept because later cells may rely on it.
location = geolocator.geocode(address)
if location is None:
    # geocode() returns None when the service finds no match for the address.
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


## Visualizing the map of Toronto and its Neighborhoods

In [22]:
# Choropleth of the average 2-bedroom price per neighbourhood.
toronto_geo = r'Neighbourhoods.geojson'
toronto_map = folium.Map(location=[latitude, longitude], zoom_start=11)

# folium binds exactly two columns: columns[0] is the key matched against
# `key_on`, columns[1] is the value that is coloured. `houses` has one row per
# house type, so the prices are averaged to one value per neighbourhood first.
neighborhood_prices = houses.groupby('Neighborhood', as_index=False)['2 Bedrooms'].mean()

choropleth_options = dict(
    geo_data=toronto_geo,
    data=neighborhood_prices,
    columns=['Neighborhood', '2 Bedrooms'],
    # NOTE(review): assumes each GeoJSON feature stores the neighbourhood name
    # under properties.name, spelled as in the 'Neighborhood' column -- confirm.
    key_on='feature.properties.name',
    fill_color='YlOrRd',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Housing Prices in Toronto',
)

# Newer folium releases provide the Choropleth class and no longer have
# Map.choropleth; older releases only have the method.
if hasattr(folium, 'Choropleth'):
    folium.Choropleth(**choropleth_options).add_to(toronto_map)
else:
    toronto_map.choropleth(**choropleth_options)

toronto_map

In [23]:
# create map of Toronto using latitude and longitude values
# Base map centred on Toronto.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One circle marker per row of `houses`; the popup lists the 2-bedroom price,
# the 3-bedroom price and the neighbourhood name.
marker_columns = ['Latitude', 'Longitude', '2 Bedrooms', '3 Bedrooms', 'Neighborhood']
for lat, lng, price_2bed, price_3bed, neighborhood in zip(*(houses[col] for col in marker_columns)):
    popup_text = '\n, '.join(str(part) for part in (price_2bed, price_3bed, neighborhood))
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto