Capstone Project - Battle of Neighborhoods (week 2)
Applied Data Science Capstone
Introduction to Business Problem
Opening a new Italian Restaurant in Atlanta, Georgia
The objective of this report is to determine the best possible location to open an Italian restaurant in Atlanta, based on the different localities of the city, the Italian restaurants already established in various parts of the city, and ease of access for the largest number of people, so that the revenue from the new venture can be maximized.


In [43]:
#Importing required libraries
import numpy as np
import pandas as pd

from geopy.geocoders import Nominatim
try:
    import geocoder
except:
    !pip install geocoder
    import geocoder

import requests
from bs4 import BeautifulSoup

try:
    import folium
except:
    !pip install folium
    import folium
    
from sklearn.cluster import KMeans

from sklearn import preprocessing

from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier

from sklearn import metrics

import matplotlib as mpl
import matplotlib.pyplot as plt


In [44]:
# install wordcloud
!pip install wordcloud
# import package and its set of stopwords
from wordcloud import WordCloud, STOPWORDS

print ('Wordcloud is installed and imported!')


Wordcloud is installed and imported!


In [45]:
# Look up the city of Atlanta with the ArcGIS provider of the geocoder package
g = geocoder.arcgis('Atlanta, Georgia, USA')
# latlng is indexed as [latitude, longitude]; bind both values in a single statement
blr_lat, blr_lng = g.latlng[0], g.latlng[1]
print("The Latitude and Longitude of the City of Atlanta is {} and {}".format(blr_lat, blr_lng))


The Latitude and Longitude of the City of Atlanta is 33.74831000000006 and -84.39110999999997


In [46]:
#Scraping the Wikipedia page for the list of localities present in Atlanta, Georgia, USA
# A timeout keeps the cell from hanging indefinitely if the server does not answer
wiki_response = requests.get("https://en.wikipedia.org/wiki/Atlanta_metropolitan_area", timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as if it were the article
wiki_response.raise_for_status()
neig = wiki_response.text

In [47]:
# Build a BeautifulSoup tree from the downloaded HTML using the 'html.parser' backend
soup = BeautifulSoup(markup=neig, features='html.parser')

In [48]:
#Creating an empty list that will hold the neighborhood names (one string per locality)
neighborhoodlist = []

In [49]:
#Searching the localities using class labels and collecting them in the neighborhood list
# NOTE(review): the recorded output of this cell shows a single row ("Metropolitan area"),
# so this selector appears not to reach the actual list of localities - TODO confirm the
# right element on the page.
category_div = soup.find('div', class_='category')
if category_div is None:
    # Clear message instead of an opaque IndexError when the page layout changes
    raise ValueError("No <div class='category'> element found in the scraped page")

# Rebuild the list from scratch so that re-running this cell does not append duplicates
neighborhoodlist = [link.text for link in category_div.find_all('a')]

#Creating a dataframe from the list
neig_df = pd.DataFrame({"Locality": neighborhoodlist})
neig_df.head()

Unnamed: 0,Locality
0,Metropolitan area


In [50]:
#Shape of dataframe neig_df as a (rows, columns) tuple
neig_df.shape

(1, 1)

In [51]:
#Defining a function to get the location of the localities
def get_location(localities):
    """Geocode one locality name within Atlanta using the ArcGIS provider.

    The name is interpolated into the query '<localities>, Atlanta, Georgia'
    and the ``latlng`` attribute of the geocoder result is returned unchanged.
    """
    query = '{}, Atlanta, Georgia'.format(localities)
    return geocoder.arcgis(query).latlng

In [52]:
# Geocode every locality in neig_df, keeping the results in the same order as the rows
coordinates = [get_location(locality) for locality in neig_df["Locality"].tolist()]
print(coordinates)

[[33.724883586114544, -84.40789176943848]]


In [53]:
# Peek at (at most) the first five geocoded entries
coordinates[:5]

[[33.724883586114544, -84.40789176943848]]

In [54]:
# Turn the list of coordinate pairs into a two-column dataframe
coordinate_columns = ['Latitudes', 'Longitudes']
coordinates_df = pd.DataFrame(data=coordinates, columns=coordinate_columns)

In [55]:
# Copy each coordinate column from coordinates_df onto neig_df (rows are matched by index)
for coordinate_column in ("Latitudes", "Longitudes"):
    neig_df[coordinate_column] = coordinates_df[coordinate_column]

In [56]:
# Report the dataframe's shape and preview its first rows
print("The shape of neig_df is {}".format(neig_df.shape))
neig_df.head()

The shape of neig_df is (1, 3)


Unnamed: 0,Locality,Latitudes,Longitudes
0,Metropolitan area,33.724884,-84.407892


In [57]:
#Creating a map centred on the coordinates found for Atlanta
blr_map = folium.Map(location=[blr_lat, blr_lng], zoom_start=11)

# folium.Marker has no `color` argument; the pin colour is set through folium.Icon
folium.Marker(
    [blr_lat, blr_lng],
    popup='<i>Atlanta</i>',
    icon=folium.Icon(color='red'),
    tooltip="Click to see"
).add_to(blr_map)

#markers for localities
# Rows whose geocoding failed carry NaN coordinates, which folium rejects, so drop them first
located_df = neig_df.dropna(subset=["Latitudes", "Longitudes"])
for latitude, longitude, name in zip(located_df["Latitudes"], located_df["Longitudes"], located_df["Locality"]):
    folium.CircleMarker(
        [latitude, longitude],
        radius=6,
        color='blue',
        popup=name,
        fill=True,
        fill_color='#3186ff'
    ).add_to(blr_map)

blr_map

In [85]:
# Import Pandas to provide DataFrame support
# NOTE(review): pandas, requests and BeautifulSoup are already imported in the first
# cell of the notebook; these repeated imports are harmless but redundant.
import pandas as pd
# Passing None removes the display limit, so every column / row of a frame is rendered
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Import Requests
import requests

# Import BeautifulSoup
from bs4 import BeautifulSoup

In [86]:
# Use the Requests get method to request the Foursquare "explore" page for Italian venues
# near Morningside - Lenox Park, Atlanta. The timeout stops the cell hanging indefinitely.
page = requests.get(
    "https://foursquare.com/explore?mode=url&near=Morningside%20-%20Lenox%20Park%2C%20Atlanta%2C%20GA&nearGeoId=102851&q=Italian",
    timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page
page.raise_for_status()

# Convert the HTML response into a BeautifulSoup Object
soup = BeautifulSoup(page.content, 'html.parser')

# Use the BeautifulSoup find_all method to extract each top site venue details.
top_venues = soup.find_all('div', class_='venueDetails')

# An empty result makes every later dataframe empty, so make it visible here
if not top_venues:
    print("Warning: no 'venueDetails' blocks were found in the Foursquare response")

In [87]:
# The column names for the top venues dataframe
venue_columns = ['id',
                 'score',
                 'category',
                 'name',
                 'address',
                 'postalcode',
                 'city',
                 'href',
                 'latitude',
                 'longitude']

# Collect one record per venue and build the dataframe once at the end.
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call.
venue_records = []

# For each venue in the BeautifulSoup HTML object
for venue in top_venues:

    # The venue id is the last path segment of the venue link
    venue_href = venue.find(class_="venueName").h2.a['href']
    venue_id = venue_href.split('/')[-1]

    # Links carrying a promotedTipId are skipped before any further work is done
    if 'promotedTipId' in venue_id:
        continue

    # Extract the remaining attributes
    venue_name = venue.find(target="_blank").get_text()
    venue_cat = venue.find(class_="categoryName").get_text()
    # Not every venue block has a "venueScore positive" element; keep the venue with no
    # score instead of crashing on None (the score is coerced to a number in a later cell)
    score_tag = venue.find(class_="venueScore positive")
    venue_score = score_tag.get_text() if score_tag is not None else None

    # Contruct the FourSquare venue API URL
    # The credentials are read from the `cfg` mapping under the same keys as the
    # restaurant search cell ('client_id', 'client_secret', 'version'). Populate cfg from
    # an environment variable or an untracked config file - never paste the secret values
    # into the notebook. NOTE(review): the values previously embedded here were exposed
    # and should be rotated.
    url = 'https://api.foursquare.com/v2/venues/{}?client_id={}&client_secret={}&v={}'.format(
        venue_id,
        cfg['client_id'],
        cfg['client_secret'],
        cfg['version'])

    # Request the venue data
    result = requests.get(url, timeout=30).json()

    # Get the properly formatted address and the latitude and longitude.
    # Address parts may be absent from the response, so missing ones become None.
    location = result['response']['venue']['location']

    # Add the venue to the list of records
    venue_records.append({'id': venue_id,
                          'score': venue_score,
                          'category': venue_cat,
                          'name': venue_name,
                          'address': location.get('address'),
                          'postalcode': location.get('postalCode'),
                          'city': location.get('city'),
                          'href': venue_href,
                          'latitude': location.get('lat'),
                          'longitude': location.get('lng')})

# Create the top venues dataframe (empty, with the same columns, when nothing was found)
df_top_venues = pd.DataFrame(venue_records, columns=venue_columns)

In [88]:
# Verify the shape of the top venues dataframe
df_top_venues.shape

(0, 10)

In [74]:
# Convert the score column to numbers; values that cannot be parsed become 0
score_numeric = pd.to_numeric(df_top_venues['score'], errors='coerce')
df_top_venues['score'] = score_numeric.fillna(0)

# Describe the score to see if there is much variance in the values
df_top_venues['score'].describe()

count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: score, dtype: float64

In [63]:

# Review the first rows of the top venues dataframe to make sure it looks as expected
df_top_venues.head()

Unnamed: 0,id,score,category,name,address,postalcode,city,href,latitude,longitude


In [75]:
# The column names for the restaurants dataframe
restaurants_columns = ['id',
                       'score',
                       'category',
                       'categoryID',
                       'name',
                       'address',
                       'postalcode',
                       'city',
                       'latitude',
                       'longitude',
                       'venue_name',
                       'venue_latitude',
                       'venue_longitude']

# Create a list of all the top venue latitude and longitude
top_venue_lats = df_top_venues['latitude'].values
top_venue_lngs = df_top_venues['longitude'].values

# Create a list of all the top venue names
top_venue_names = df_top_venues['name'].values

# Configure additional Search parameters (identical for every venue, so set once)
# This is the FourSquare Category Id for all food venues
categoryId = '4d4b7105d754a06374d81259'
radius = 500
limit = 50

# Collect one record per restaurant and build the dataframe once at the end.
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call.
restaurant_records = []

# Iterate over each of the top venues
# The venue name, latitude and longitude are passed to the loop
for ven_name, ven_lat, ven_long in zip(top_venue_names, top_venue_lats, top_venue_lngs):

    # Construct the FourSquare search API URL
    url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&v={}&categoryId={}&radius={}&limit={}'.format(
        cfg['client_id'],
        cfg['client_secret'],
        ven_lat,
        ven_long,
        cfg['version'],
        categoryId,
        radius,
        limit)

    # Make the search request (the timeout stops the cell hanging indefinitely)
    results = requests.get(url, timeout=30).json()

    # A response without a venue list is treated as "no restaurants found"
    nearby_restaurants = results.get('response', {}).get('venues', [])

    # Want a good selection of restaurants
    # If less than 10 are returned ignore
    if len(nearby_restaurants) < 10:
        continue

    # Get the values for each Restaurant from the JSON
    for restaurant in nearby_restaurants:

        # Sometimes the Venue JSON is missing data. If so ignore and continue
        try:
            # Get location details
            primary_category = restaurant['categories'][0]
            rest_location = restaurant['location']
            record = {'id': restaurant['id'],
                      'category': primary_category['pluralName'],
                      'categoryID': primary_category['id'],
                      'name': restaurant['name'],
                      'address': rest_location['address'],
                      'postalcode': rest_location['postalCode'],
                      'city': rest_location['city'],
                      'latitude': rest_location['lat'],
                      'longitude': rest_location['lng'],
                      'venue_name': ven_name,
                      'venue_latitude': ven_lat,
                      'venue_longitude': ven_long}

            # Construct the FourSquare venue API URL to get the venue's rating / score
            rest_url = 'https://api.foursquare.com/v2/venues/{}?client_id={}&client_secret={}&v={}'.format(
                record['id'],
                cfg['client_id'],
                cfg['client_secret'],
                cfg['version'])

            # Get the restaurant score
            result = requests.get(rest_url, timeout=30).json()
            record['score'] = result['response']['venue']['rating']

        # Skip a restaurant only for the expected failures: missing JSON fields, an
        # undecodable response, or a network error. A bare except would also hide
        # programming errors and swallow KeyboardInterrupt.
        except (KeyError, IndexError, ValueError, requests.RequestException):
            continue

        restaurant_records.append(record)

# Create the restaurants dataframe (empty, with the same columns, when nothing was found)
df_restaurant = pd.DataFrame(restaurant_records, columns=restaurants_columns)

In [76]:
# Verify the shape of the restaurants dataframe as a (rows, columns) tuple
df_restaurant.shape

(0, 13)

In [77]:
# Review the first rows of the restaurants dataframe to make sure it looks as expected
df_restaurant.head()

Unnamed: 0,id,score,category,categoryID,name,address,postalcode,city,latitude,longitude,venue_name,venue_latitude,venue_longitude


In [78]:
# Describe the score to see if there is much variance in the values
df_restaurant['score'].describe()

count       0
unique      0
top       NaN
freq      NaN
Name: score, dtype: object

In [79]:
# Count the distinct top venues that contributed restaurants
# (venues with fewer than 10 nearby restaurants were skipped when the frame was built)
df_restaurant['venue_name'].nunique()

0

In [80]:
# Number of distinct restaurant categories
df_restaurant['category'].nunique()

0

In [81]:
# Number of distinct restaurant names
df_restaurant['name'].nunique()

0

In [82]:
# What are the top 10 most frequently occurring restaurant types
restaurants_per_category = df_restaurant.groupby('category')['name'].count()
restaurants_per_category.sort_values(ascending=False).head(10)

Series([], Name: name, dtype: int64)