DogGo - This Jupyter notebook contains code to generate the datasets and files required by DogGo.

In [None]:
import requests
from bs4 import BeautifulSoup
import time

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

import networkx as nx
import osmnx as ox

import folium
from folium import FeatureGroup, LayerControl, Map, Marker
from folium.plugins import HeatMap

%matplotlib inline

In [None]:
#Assemble dog breed, size, exercise reqs csv
#Download the Dogtime breed index and pull a breed name out of the text that
#follows every 'dog-breeds' occurrence in the search-results list.
#timeout stops the cell from hanging forever on a stalled connection;
#raise_for_status() aborts on an HTTP error instead of parsing an error page.
page = requests.get('http://dogtime.com/dog-breeds', timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.text, 'html.parser')
soup_breeds = str(soup.find_all(class_='search-results-list'))
dogs = []

while 'dog-breeds' in soup_breeds:
    #Cut just past the start of the current match so the loop always advances
    soup_breeds = soup_breeds[soup_breeds.index('dog-breeds')+1:]
    slash = soup_breeds.index('/')
    greater = soup_breeds.index('>')
    #Name is the text after the slash; greater-1 also drops the character right
    #before '>' (presumably the closing quote of the link - confirm against the page)
    name = soup_breeds[slash+1:greater-1]
    dogs.append(name)

#List of dogs
#dict.fromkeys removes duplicates while keeping first-seen order
dogs = list(dict.fromkeys(dogs))

#Mutt is a bit arbitrary, users should pick a more specific breed or a preset
#Guarded so the cell still runs if the site no longer lists 'mutt'
if 'mutt' in dogs:
    dogs.remove('mutt')

#Run next cell

In [None]:
#Scrape exercise levels and dog sizes from Dogtime.com
#Fills two lists that stay aligned with `dogs`: one exercise level (int) and
#one raw height string ('0' when the page has no height entry) per breed.
exercise_levels = []
heights = []

for dog in dogs:
    #Pause between requests
    time.sleep(2)
    print(dog)
    url = 'http://dogtime.com/dog-breeds/' + dog
    #timeout keeps a stalled connection from blocking the whole scrape;
    #raise_for_status() stops on an HTTP error rather than parsing an error page
    page = requests.get(str(url).rstrip(), timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'html.parser')
    soup_string = str(soup)

    #The level is read as the single character 80 positions after the
    #'Exercise Needs' label (offset tied to the page markup - verify if the site changes)
    needs_at = soup_string.index('Exercise Needs')
    exercise_levels.append(int(soup_string[needs_at+80:needs_at+81]))

    if 'Height:<' in soup_string:
        #Keep the text from 13 characters after 'Height:<' up to the next '</div><div'
        height_html = soup_string[soup_string.index('Height:<'):]
        heights.append(height_html[13:height_html.index('</div><div')])
    else:
        heights.append('0')

#Run next cell

In [None]:
#Parse the dog heights to obtain an estimate of height in inches

#Some formats:
#X to Y inches
#A feet to A feet, B inches
#Up to X inches

#Filler text removed from a height string before it is split into tokens
_HEIGHT_FILLER = ['½','tall at the shoulder', ',', 'inches', 'inch', 'from']


def _height_tokens(height):
    """Lower-case a raw height string, strip the filler text, rewrite
    '1 foot' / '2 feet' as 12 / 24 and split the result on single spaces."""
    height = height.lower()
    for filler in _HEIGHT_FILLER:
        height = height.replace(filler, '')
    height = height.replace('1 foot', '12')
    height = height.replace('2 feet', '24')
    return height.split(' ')


def _estimate_inches(tokens):
    """Return the first numeric token, plus the token right after it when that
    one is numeric too (e.g. 12 from '1 foot' followed by extra inches).

    Returns 0 when no token is numeric, so every input yields exactly one
    value and the result list stays the same length as the breed list."""
    for i, token in enumerate(tokens):
        if token.isdigit():
            #The last token has no successor; treat it as a lone number
            following = tokens[i+1] if i + 1 < len(tokens) else ''
            if following.isdigit():
                return int(token) + int(following)
            return int(token)
    return 0


heights2 = [_height_tokens(height) for height in heights]
final_heights = [_estimate_inches(tokens) for tokens in heights2]

#Run next cell

In [None]:
#Save .csv of dog data
#Three preset selections are listed first, followed by the scraped breeds
preset_names = ['Small/Low Energy Dog','Medium Size/Energy Dog','Big/High Energy Dog']
preset_exercise = [2,3,4]
preset_heights = [10,16,22]

#Preset values are prepended so every column lines up with the name column
exercise_levels_df = preset_exercise + exercise_levels
heights_df = preset_heights + final_heights

dog_df = pd.DataFrame({
    'Name': preset_names + dogs,
    'Exercise-Needs': exercise_levels_df,
    'Height': heights_df,
})
dog_df.to_csv('data/dogbreeds.csv')

In [None]:
#Assemble park csv - takes a while

#From Boston Parks and Recreation
#https://www.boston.gov/departments/parks-and-recreation/popular-playgrounds-and-parks-boston
#Column 1 = Name, Column 2 = Address

park_df = pd.read_csv('data/parks.csv')
lats = []
lons = []

#Attempt to find lat and lon of these Boston parks
#1st try match by name, then by address
#Manually removed parks that did not match
for row in park_df.itertuples():
    time.sleep(2)
    #(0, 0) is stored for a park that neither query could locate
    lat, lon = 0, 0
    for field in ('Name', 'Address'):
        #Catch Exception rather than using a bare except, so a keyboard/kernel
        #interrupt can still stop this long-running loop
        try:
            lat, lon = ox.geo_utils.geocode(getattr(row, field) + ' Boston')
            break
        except Exception:
            #Fall through to the next query (or keep the 0, 0 placeholder)
            continue
    lats.append(lat)
    lons.append(lon)

#Make new columns in df and save
park_df['lat'] = lats
park_df['lon'] = lons
park_df.to_csv('data/parks.csv')

In [None]:
#Assemble tree csv

#Boston from https://data.boston.gov/dataset/trees (200k trees)
#Brookline from http://data.brooklinema.gov/datasets/09a978fa7ffc46d7b6ca06adfddecdf8_0 (11k trees)

#tree_df = pd.read_csv('data/boston_trees.csv')
tree_df = pd.read_csv('data/brookline_trees.csv')

#Get bounding box coordinates
xmax = tree_df['X'].max() #East
xmin = tree_df['X'].min() #West
ymax = tree_df['Y'].max() #North
ymin = tree_df['Y'].min() #South

H = ox.graph_from_bbox(ymax, ymin, xmax, xmin, network_type='walk')

#Count, for every graph node, the trees whose nearest node it is
node_treecount = {}
for row in tree_df.itertuples():
    #X is used as longitude and Y as latitude; the lookup takes (lat, lon)
    node = int(ox.get_nearest_node(H, (row.Y, row.X)))
    node_treecount[node] = node_treecount.get(node, 0) + 1

node_tree_df = pd.DataFrame.from_dict(node_treecount, orient='index', columns=['trees'])
#index_label='node' names the id column in the file; the merge cell reads it
#back as `.node`, which an unlabeled index (read as 'Unnamed: 0') would break
#node_tree_df.to_csv('data/boston_nodetrees.csv', index_label='node')
node_tree_df.to_csv('data/brookline_nodetrees.csv', index_label='node')

In [None]:
#After Boston and Brookline datasets generated, merge them

def _read_nodetrees(path):
    """Load a node/tree-count csv and guarantee a column called 'node'.

    When the file was saved without an index label the ids sit in the first
    column under a placeholder header, so that column is renamed to 'node'.
    A file that already has a 'node' column is returned unchanged."""
    frame = pd.read_csv(path)
    if 'node' not in frame.columns:
        frame = frame.rename(columns={frame.columns[0]: 'node'})
    return frame


boston_nodetree = _read_nodetrees('data/boston_nodetrees.csv')
brookline_nodetree = _read_nodetrees('data/brookline_nodetrees.csv')

boston_tree_dict = dict(zip(boston_nodetree.node, boston_nodetree.trees))
brookline_tree_dict = dict(zip(brookline_nodetree.node, brookline_nodetree.trees))

#Counter addition sums the counts of nodes that appear in both datasets
combined_tree_dict = dict(Counter(boston_tree_dict)+Counter(brookline_tree_dict))
combined_tree_df = pd.DataFrame.from_dict(combined_tree_dict, orient='index', columns=['trees'])
#index_label='node' writes the ids under a 'node' header so the file can be
#loaded later with set_index('node')
combined_tree_df.to_csv('data/combined_nodetrees.csv', index_label='node')

In [None]:
#This cell takes a few minutes
#Generate graph given bounding box coordinates
#This takes (North, South, East, West) lats/lons

xmax = -71.02 #East
xmin = -71.18 #West
ymax = 42.38  #North
ymin = 42.32  #South

G = ox.graph_from_bbox(ymax, ymin, xmax, xmin, network_type='walk')

#Get nodes and edges in dataframes
gdf_nodes, gdf_edges = ox.graph_to_gdfs(G)

#Modify gdf variables
#Drop the columns that are not needed downstream. errors='ignore' skips any
#listed column that is missing from the downloaded data instead of raising
#KeyError and aborting after the slow graph download.
gdf_nodes.drop(columns=['ref', 'highway','osmid','geometry'], inplace=True, errors='ignore')
gdf_edges.drop(columns=['maxspeed','osmid','tunnel','ref','name','service','junction','bridge','access','area','geometry','oneway','lanes'], inplace=True, errors='ignore')

In [None]:
#Plot graph
#Passes graph G with an empty route list, so presumably only the street
#network itself is drawn (no routes to overlay).
#The two returned values are kept as fig and ax.
fig, ax = ox.plot_graph_routes(G, [])

In [None]:
#Assign tree and safety values to edges
combined_tree_df = pd.read_csv('data/combined_nodetrees.csv')
if 'node' not in combined_tree_df.columns:
    #File saved without an index label: the node ids are in the first column
    combined_tree_df = combined_tree_df.rename(columns={combined_tree_df.columns[0]: 'node'})
combined_tree_df = combined_tree_df.set_index('node')

#Remove a 'trees' column left by an earlier run so the join can be repeated
#without a column-overlap error
gdf_nodes = gdf_nodes.drop(columns=['trees'], errors='ignore').join(combined_tree_df[['trees']])
#Nodes without a tree record count as zero trees. Assigning the result back
#avoids chained in-place fillna, which may not update the frame.
gdf_nodes['trees'] = gdf_nodes['trees'].fillna(0)

#Trees
#Each edge gets the mean of its two end nodes' tree counts, truncated to int
tree_by_node = gdf_nodes['trees']
u_trees = gdf_edges['u'].map(tree_by_node)
v_trees = gdf_edges['v'].map(tree_by_node)
gdf_edges['trees'] = (0.5*(u_trees + v_trees)).astype(int)

#Safety
#0 for edges whose highway value is 'residential', 5 for every other edge
gdf_edges['safety'] = np.where(gdf_edges['highway']=='residential', 0, 5)

In [None]:
#Copy each edge's tree count and safety value from gdf_edges onto the graph,
#keyed by the edge identifier (u, v, key)
edge_ids = list(zip(gdf_edges['u'], gdf_edges['v'], gdf_edges['key']))

tree_counts = dict(zip(edge_ids, gdf_edges['trees']))
road_safety = dict(zip(edge_ids, gdf_edges['safety']))

#Stored on G as the 'numtrees' and 'safety' edge attributes
nx.set_edge_attributes(G, tree_counts, 'numtrees')
nx.set_edge_attributes(G, road_safety, 'safety')

In [None]:
#Save graph and dataframes to pickle
#ox.save_graphml(G, filename='greater_boston')
#gdf_nodes.to_pickle('data/nodes.pkl',protocol=4)
#gdf_edges.to_pickle('data/edges.pkl',protocol=4)

In [None]:
#Generate histogram of street lengths

#Residential streets only
is_residential = gdf_edges['highway'] == 'residential'
gdf_edges_res = gdf_edges.loc[is_residential]
segment_lengths = gdf_edges_res['length'].values.tolist()

#Bin count chosen automatically; x axis clipped to the -5..300 range
_ = plt.hist(segment_lengths, bins='auto')
plt.title('Lengths of road segments in Greater Boston', fontsize=20)
plt.xlabel('Length (m)', fontsize=16)
plt.ylabel('Frequency', fontsize=16)
plt.xlim([-5, 300])
plt.show()

In [None]:
#Generate histogram of trees/street

#Bin edges are the integers 1..49, so edges with zero trees fall outside every bin
trees_per_street = gdf_edges['trees'].tolist()
bin_edges = np.arange(1,50)

_ = plt.hist(trees_per_street, bin_edges)
plt.title('Trees/street in Boston', fontsize=20)
plt.xlabel('Number of trees/street', fontsize=16)
plt.ylabel('Frequency', fontsize=16)
plt.xlim([0,50])
plt.show()

In [None]:
#Load graph, nodes, and edges (if necessary)
#G = ox.load_graphml(filename='greater_boston')
#gdf_nodes = pd.read_pickle('data/nodes.pkl')
#gdf_edges = pd.read_pickle('data/edges.pkl')

In [None]:
#Generate a heatmap of Boston trees

for_map = pd.read_csv('data/boston_trees.csv')

#One point per tree, given in (Y, X) order
tree_points = list(zip(for_map['Y'].values, for_map['X'].values))

hm_base = folium.Map(location=[42.3, -71.1], zoom_start=11)
hm_content = HeatMap(tree_points, min_opacity=1, radius=4, blur=5)

#Left as the cell's last expression so the notebook displays the result
hm_base.add_child(hm_content)
#hm_base.save('bostonheatmap.html')

In [None]:
#Generate a heatmap of Brookline trees

for_map = pd.read_csv('data/brookline_trees.csv')

#One point per tree, given in (Y, X) order
tree_points = list(zip(for_map['Y'].values, for_map['X'].values))

hm_base = folium.Map(location=[42.3, -71.1], zoom_start=11)
hm_content = HeatMap(tree_points, min_opacity=1, radius=4, blur=5)

#Left as the cell's last expression so the notebook displays the result
hm_base.add_child(hm_content)
#hm_base.save('brooklineheatmap.html')

In [None]:
#Generate a map of street-quietness in Boston

#Keep only the edges whose safety value is 5
edges = gdf_edges.loc[gdf_edges['safety'] == 5]

#Look up the x/y of each selected edge's start node (u), in edge order
start_nodes = gdf_nodes.loc[edges['u'].tolist()]
xs = start_nodes['x'].tolist()
ys = start_nodes['y'].tolist()

#Append the coordinates as the last two columns
edges.insert(len(edges.columns), 'x', xs)
edges.insert(len(edges.columns), 'y', ys)

#Heatmap points are given in (y, x) order
edge_points = list(zip(edges['y'].values, edges['x'].values))

hm_base = folium.Map(location=[42.3, -71.1], zoom_start=11)
hm_content = HeatMap(edge_points, min_opacity=1, radius=4, blur=5)

#Left as the cell's last expression so the notebook displays the result
hm_base.add_child(hm_content)
#hm_base.save('quiet_streets.html')