## Scraping Google Maps data for the locations of Calgary pubs

Base URL

https://www.google.com/maps/search/pubs+in+calgary/@51.0453812,-114.2203065,12z

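Open the page in the browser's developer tools ("Inspect"), select the `<body>` element, right-click it and choose "Edit as HTML", copy the markup into a file named `google_maps.html` in the same folder as this notebook, then run the cells below.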

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
import folium
from folium.plugins import MarkerCluster

In [None]:
# Scrape data
# Load the saved page inside a context manager so the file handle is closed
# as soon as the markup has been read. The encoding and the parser are named
# explicitly so the result does not depend on the platform default or on
# which optional parsers happen to be installed.
with open("./google_maps.html", encoding="utf-8") as page:
    soup = BeautifulSoup(page.read(), "html.parser")

# attempt 1: links inside the first div whose class attribute equals the
# string below; falls back to an empty list when no such div is on the page
subsection = soup.find_all("div",
                           {"class":"section-layout section-scrollbox scrollable-y scrollable-show section-layout-flex-vertical section-layout-inset-shadow"})
google_locations = subsection[0].find_all("a", href=True) if subsection else []

# attempt 2: every link on the page that carries an aria-label; the label is
# stored as the name and the href as the URL, at matching list positions
pub_name = []
pub_href = []
for a in soup.find_all("a", href=True):
    label = a.get("aria-label")
    if label is None:
        # Unlabelled links are skipped
        continue
    pub_name.append(label)
    pub_href.append(a["href"])

In [None]:
# Selecting data with pub information
# Drop the first and the last collected entry from both lists in one step so
# names and links stay aligned (presumably those two links are page controls
# rather than pubs -- verify against the saved HTML).
pub_name, pub_href = pub_name[1:-1], pub_href[1:-1]

In [None]:
import re

# Signed decimal number at the very start of a string, e.g. "-114.06" in
# "-114.06?hl=en"
_LEADING_NUMBER = re.compile(r"[-+]?\d+(?:\.\d+)?")


def _coordinate(field):
    """Return the number that follows the two-character prefix of *field*.

    Anything after the number (such as a trailing query string) is ignored,
    so the result does not depend on how many digits the coordinate has.

    Raises:
        ValueError: if no number follows the two-character prefix.
    """
    match = _LEADING_NUMBER.match(field[2:])
    if match is None:
        raise ValueError(f"no coordinate found in URL field {field!r}")
    return float(match.group())


# Split every link once: the coordinates are read from the last two
# "!"-separated fields of the part that follows "data"
_data_fields = [href.split("data")[1].split("!") for href in pub_href]

#longitude
pub_longitude = [_coordinate(fields[-1]) for fields in _data_fields]
#latitude
pub_latitude = [_coordinate(fields[-2]) for fields in _data_fields]

In [None]:
# form dataframe
# One column per list; the row at position i combines entry i of every list.
calgary_pubs = pd.DataFrame(
    data={
        "PubName": pub_name,
        "Longitude": pub_longitude,
        "Latitude": pub_latitude,
        "GoogleURL": pub_href,
    }
)

# Save the table as CSV in the working directory
calgary_pubs.to_csv(path_or_buf="Pub_data_Calgary.csv")

In [None]:
# Centre the map on the first pub in the table
latitude = calgary_pubs["Latitude"].iloc[0]
longitude = calgary_pubs["Longitude"].iloc[0]

# Initial coordinates
SC_COORDINATES = [latitude, longitude]

# Create a map using our initial coordinates
# NOTE(review): newer folium releases may no longer ship the Stamen tile sets;
# if this call raises, switch to e.g. tiles="OpenStreetMap" -- confirm against
# the installed folium version.
map_osm = folium.Map(location=SC_COORDINATES, zoom_start=10, tiles='Stamen Terrain')

# Create marker cluster and add to our map
marker_cluster = MarkerCluster().add_to(map_osm)

# Upper bound on the number of pubs drawn (currently all of them)
MAX_RECORDS = len(calgary_pubs)

# One marker per pub, with the pub name as popup text. The markers are added
# to the cluster (not to the map directly) so that nearby pubs are grouped.
for pub in calgary_pubs.iloc[:MAX_RECORDS].itertuples(index=False):
    folium.Marker(
        location=[pub.Latitude, pub.Longitude],
        popup=folium.Popup(pub.PubName, sticky=True),
    ).add_to(marker_cluster)

# Show the map
display(map_osm)

In [None]:
# Plot
# Scatter of the pub coordinates: longitude on the x axis, latitude on the y axis
plt.scatter(x=calgary_pubs["Longitude"], y=calgary_pubs["Latitude"])
plt.title("Calgary Pubs")
plt.xlabel("Longitude")
plt.ylabel("Latitude")
plt.show()

In [None]:
from math import sin, cos, sqrt, atan2, radians
import numpy as np

# approximate radius of earth in km
# R = 6373.0

# lat1 = radians(52.2296756)
# lon1 = radians(21.0122287)
# lat2 = radians(52.406374)
# lon2 = radians(16.9251681)

# dlon = lon2 - lon1
# dlat = lat2 - lat1

# a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
# c = 2 * atan2(sqrt(a), sqrt(1 - a))

# distance = R * c

# print("Result:", distance)
# print("Should be:", 278.546, "km")

In [None]:
# Apply math.radians to every latitude and longitude value of the table,
# keeping the results in two lists ordered like the table rows
rad_latitude = [radians(value) for value in calgary_pubs['Latitude'].to_list()]
rad_longitude = [radians(value) for value in calgary_pubs['Longitude'].to_list()]

In [None]:
# Store the radian values as two new columns of the pub table
calgary_pubs["RadLat"], calgary_pubs["RadLong"] = rad_latitude, rad_longitude

In [None]:
# Haversine distance between every pair of pubs, adapted from
# https://stackoverflow.com/questions/19412462/getting-distance-between-two-points-based-on-latitude-longitude
# approximate radius of earth in km
R = 6373.0

# Coordinate columns as float arrays, one entry per pub
lat_rad = calgary_pubs['RadLat'].to_numpy(dtype=float)
lon_rad = calgary_pubs['RadLong'].to_numpy(dtype=float)

# Pairwise differences via broadcasting: entry [i, j] is pub j minus pub i,
# so rows correspond to the first pub and columns to the second
delta_lat = lat_rad[np.newaxis, :] - lat_rad[:, np.newaxis]
delta_lon = lon_rad[np.newaxis, :] - lon_rad[:, np.newaxis]

# Computing distance
a = (
    np.sin(delta_lat / 2) ** 2
    + np.cos(lat_rad)[:, np.newaxis] * np.cos(lat_rad)[np.newaxis, :] * np.sin(delta_lon / 2) ** 2
)
# Rounding can push a marginally outside [0, 1]; clip it so both square
# roots below stay real
a = np.clip(a, 0.0, 1.0)
c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

# Square matrix sized by the number of rows in calgary_pubs (no fixed 22 x 22);
# the diagonal is zero because every pub is at distance 0 from itself
result_matrix = R * c


In [None]:
# Wrap the distance matrix in a DataFrame whose rows and columns are both
# labelled with the pub names. The labels are taken straight from the table,
# so this works for any number of pubs as long as result_matrix has one
# row and one column per pub.
pub_labels = calgary_pubs['PubName'].to_list()
distance_matrix = pd.DataFrame(result_matrix, index=pub_labels, columns=pub_labels)

In [None]:
# Bare expression at the end of the cell: the notebook renders distance_matrix
# as the cell's output
distance_matrix

In [None]:
# From https://networkx.org/documentation/latest/auto_examples/drawing/plot_weighted_graph.html
import matplotlib.pyplot as plt
import networkx as nx

# Edges with a weight at or below this value (same units as distance_matrix)
# are the ones that get drawn
CLOSE_PUB_THRESHOLD = 0.6

node_names = list(distance_matrix.columns)
pair_weights = distance_matrix.to_numpy()

G = nx.Graph()
# Register every pub up front so each one is drawn even if it ends up
# without any edge
G.add_nodes_from(node_names)

# distance_matrix is expected to be symmetric with a zero diagonal, so each
# unordered pair is visited once and the pub-to-itself entries are skipped
# (they would otherwise become zero-weight self-loops that get drawn and
# labelled). Positional indexing keeps the lookup a scalar even when two
# pubs share a name.
for row, name_a in enumerate(node_names):
    for col in range(row + 1, len(node_names)):
        G.add_edge(name_a, node_names[col], weight=pair_weights[row, col])

elarge = [(u, v, d) for (u, v, d) in G.edges(data=True) if d["weight"] > CLOSE_PUB_THRESHOLD]
esmall = [(u, v, d) for (u, v, d) in G.edges(data=True) if d["weight"] <= CLOSE_PUB_THRESHOLD]

# Create the figure with its final size BEFORE drawing; changing
# rcParams["figure.figsize"] after the figure exists does not resize it
fig, ax = plt.subplots(figsize=(15, 10))

# positions for all nodes - seed for reproducibility
# NOTE(review): spring_layout uses the "weight" attribute as attraction
# strength, so larger distances pull nodes closer together -- confirm this
# is the intended layout.
pos = nx.spring_layout(G, seed=7)

# nodes
nx.draw_networkx_nodes(G, pos, node_size=700, ax=ax)

# edges (only the short ones; pass edgelist=elarge to draw the long ones too)
nx.draw_networkx_edges(
    G, pos, edgelist=esmall, width=3, alpha=0.5, edge_color="b", style="dashed", ax=ax
)

# labels
nx.draw_networkx_labels(G, pos, font_size=20, font_family="sans-serif", ax=ax)
nx.draw_networkx_edge_labels(
    G,
    pos,
    edge_labels={(u, v): d["weight"] for (u, v, d) in esmall},
    font_color="black",
    ax=ax,
)

ax.margins(0.01)
ax.set_axis_off()
fig.tight_layout()
plt.show()