# IBM Data Science Professional Certificate Capstone

This notebook is for the neighborhood analyzation project for the data science capstone course on Coursera.

## Introduction
XX

## Part 1 - Identifying our potential locations

In [1]:
# Our needed imports.
#!conda install -c conda-forge folium --yes
import folium
import ibm_boto3
import json
import math
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
import types
from IPython.display import Image 
from sklearn.cluster import KMeans

In [2]:
# Create our corners of Gainesville.
gainesville_north = 29.711381
gainesville_south = 29.596737
gainesville_west = -82.453961
gainesville_east = -82.262119

In [3]:
# Define how many rows and columns we want to create for potential locations.
LOCATION_ROWS = 11
LOCATION_COLUMNS = 16
GAINESVILLE_LATITUDE = 29.662737
GAINESVILLE_LONGITUDE = -82.370212

In [4]:
# Calculate how big each location is.
lat_diff = gainesville_north - gainesville_south
long_diff = gainesville_west - gainesville_east
lat_segment = lat_diff / (LOCATION_ROWS)
long_segment = long_diff / (LOCATION_COLUMNS)

In [5]:
# Generate the center for all locations.
gainesville_locations = pd.DataFrame(columns=['Location', 'Lat', 'Long'])
north_boundary = gainesville_north
for row in range(LOCATION_ROWS):
    south_boundary = north_boundary - lat_segment
    row_center = (north_boundary + south_boundary) / 2
    west_boundary = gainesville_west
    for column in range(LOCATION_COLUMNS):
        east_boundary = west_boundary - long_segment
        column_center = (east_boundary + west_boundary) / 2
        west_boundary = east_boundary
        gainesville_locations = gainesville_locations.append(pd.Series(['{}-{}'.format(row, column), row_center, column_center], index=gainesville_locations.columns), ignore_index=True)
    north_boundary = south_boundary
gainesville_locations.head()

Unnamed: 0,District,Lat,Long
0,0-0,29.70617,-82.447966
1,0-1,29.70617,-82.435976
2,0-2,29.70617,-82.423986
3,0-3,29.70617,-82.411996
4,0-4,29.70617,-82.400005


In [None]:
# Count our potential locations
LOCATION_COUNT = len(gainesville_locations)
LOCATION_COUNT

In [33]:
# Calculate radius of each location

# Get our coordinates
first = gainesville_locations.iloc[0]
second = gainesville_locations.iloc[1]
from_coords = (first['Lat'], first['Long'])
to_coords = (second['Lat'], second['Long'])

# Convert to radians
from_radians = [math.radians(coord) for coord in from_coords]
to_radians = [math.radians(coord) for coord in to_coords]
delta_longitudes = to_radians[1] - from_radians[1]

# Calculate using Haversine formula
angle_degrees = 2 * math.asin(
    math.sqrt(
        math.pow(math.sin((to_radians[0] - from_radians[0])/2), 2) +
        math.cos(from_radians[0]) * math.cos(to_radians[0]) * math.pow(math.sin(delta_longitudes/2), 2)
    )
)

# Convert to meters to get diameter and then divide by 2 to get radius
LOCATION_RADIUS = round((angle_degrees * 6372795) / 2)
print('{} meters'.format(LOCATION_RADIUS))

579 meters


In [7]:
# Create map of Gainesville to see our locations.
general_map = folium.Map(location=[GAINESVILLE_LATITUDE, GAINESVILLE_LONGITUDE], zoom_start=12)

# Add markers to the map for each locations.
for index, row in gainesville_locations.iterrows():
    folium.CircleMarker(
        [row['Lat'], row['Long']],
        radius=17,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(general_map)  
    
general_map

## Part 2 - Getting nearby businesses for each location

In [8]:
# The code was removed by Watson Studio for sharing.

In [9]:
# Prepares our venue DataFrame.
location_venues = pd.DataFrame(columns=[
                            'Location',
                            'Lat',
                            'Long', 
                            'Venue', 
                            'Venue Latitude', 
                            'Venue Longitude', 
                            'Venue Category'])

In [34]:
# Function for getting all venues in an area
def get_venues(lat, long, limit):
    # create the API request URL.
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
        CLIENT_ID, 
        CLIENT_SECRET, 
        VERSION,
        lat, 
        long, 
        LOCATION_RADIUS, 
        limit)
    
    # Load our results.
    r = requests.get(url)
    results = r.json()
    
    # Get the venues.
    try:
        venues = results["response"]['groups'][0]['items']
        return venues
    except KeyError:
        print('Trouble finding venues for {}. Returned response was:'.format(row['Location']), results["response"])
    return []
        

In [35]:
for index, row in gainesville_locations.iterrows():
    if index % 10 == 0:
        print('Location {} of {}...'.format(index, LOCATION_COUNT))
    venues = get_venues(row['Lat'], row['Long'], 100)
    # Add each venue to our DataFrame.
    for venue in venues:
        location_venues = location_venues.append(pd.Series([
            row['Location'],
            row['Lat'],
            row['Long'],
            venue['venue']['name'],
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']], index=location_venues.columns), ignore_index=True)

District 0 of 176...
Trouble finding venues for 0-0. Returned response was: {}
Trouble finding venues for 0-1. Returned response was: {}
Trouble finding venues for 0-2. Returned response was: {}
Trouble finding venues for 0-3. Returned response was: {}
Trouble finding venues for 0-4. Returned response was: {}
Trouble finding venues for 0-5. Returned response was: {}
Trouble finding venues for 0-6. Returned response was: {}
Trouble finding venues for 0-7. Returned response was: {}
Trouble finding venues for 0-8. Returned response was: {}
Trouble finding venues for 0-9. Returned response was: {}
District 10 of 176...
Trouble finding venues for 0-10. Returned response was: {}
Trouble finding venues for 0-11. Returned response was: {}
Trouble finding venues for 0-12. Returned response was: {}
Trouble finding venues for 0-13. Returned response was: {}
Trouble finding venues for 0-14. Returned response was: {}
Trouble finding venues for 0-15. Returned response was: {}
Trouble finding venues 

In [12]:
# Quick preview of our venues.
print(location_venues.shape)
location_venues.head()

(320, 7)


Unnamed: 0,District,Lat,Long,Venue,Venue Latitude,Venue Longitude,Venue Category
0,0-0,29.70617,-82.447966,The Hammock Lake,29.707468,-82.44342,Lake
1,0-1,29.70617,-82.435976,"Agile Sports Analytics, LLC",29.704679,-82.431546,Sports Club
2,0-2,29.70617,-82.423986,Flying Ten Airport-OJ8,29.702759,-82.425678,Airport Terminal
3,0-3,29.70617,-82.411996,McGriff Landscaping and Fencing,29.708351,-82.417159,Construction & Landscaping
4,0-4,29.70617,-82.400005,Devil's Millhopper Geological State Park,29.70541,-82.39441,State / Provincial Park


## Part 3 - Get just nearby restaurants for each location

In [13]:
# Sets up our categories
food_category = '4d4b7105d754a06374d81259'

In [14]:
# Prepares our venue DataFrame.
location_restaurants = pd.DataFrame(columns=[
                            'Location',
                            'Lat',
                            'Long',
                            'Venue', 
                            'Venue Latitude', 
                            'Venue Longitude', 
                            'Venue Category'])

In [15]:
# Function for getting all venues in an area
def get_restaurants(lat, long, limit):
    # create the API request URL.
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&categoryId={}&ll={},{}&radius={}&limit={}'.format(
        CLIENT_ID, 
        CLIENT_SECRET, 
        VERSION,
        food_category,
        lat, 
        long, 
        LOCATION_RADIUS, 
        limit)
    
    # Load our results.
    r = requests.get(url)
    results = r.json()
    
    # Get the venues.
    try:
        venues = results["response"]['groups'][0]['items']
        return venues
    except KeyError:
        print('Trouble finding venues for {}. Returned response was:'.format(row['Location']), results["response"])
    return []
        

In [16]:
for index, row in gainesville_locations.iterrows():
    if index % 10 == 0:
        print('Location {} of {}...'.format(index, LOCATION_COUNT))
    venues = get_restaurants(row['Lat'], row['Long'], 100)
    # Add each venue to our DataFrame.
    for venue in venues:
        location_restaurants = location_restaurants.append(pd.Series([
            row['Location'],
            row['Lat'],
            row['Long'],
            venue['venue']['name'],
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']], index=location_restaurants.columns), ignore_index=True)

District 0 of 176...
Trouble finding venues for 0-0. Returned response was: {}
Trouble finding venues for 0-1. Returned response was: {}
Trouble finding venues for 0-2. Returned response was: {}
Trouble finding venues for 0-3. Returned response was: {}
Trouble finding venues for 0-4. Returned response was: {}
Trouble finding venues for 0-5. Returned response was: {}
Trouble finding venues for 0-6. Returned response was: {}
Trouble finding venues for 0-7. Returned response was: {}
Trouble finding venues for 0-8. Returned response was: {}
Trouble finding venues for 0-9. Returned response was: {}
District 10 of 176...
Trouble finding venues for 0-10. Returned response was: {}
Trouble finding venues for 0-11. Returned response was: {}
Trouble finding venues for 0-12. Returned response was: {}
Trouble finding venues for 0-13. Returned response was: {}
Trouble finding venues for 0-14. Returned response was: {}
Trouble finding venues for 0-15. Returned response was: {}
Trouble finding venues 

In [17]:
# Quick preview of our venues.
print(location_restaurants.shape)
location_restaurants.head()

(0, 7)


Unnamed: 0,District,Lat,Long,Venue,Venue Latitude,Venue Longitude,Venue Category


In [18]:
# Let's see how many locations have at least one restaurant.
print('Total locations with at least one restaurant: {}'.format(len(location_restaurants.groupby('Location').count())))

Total districts with at least one restaurant: 0


## Part 4 - Determining which locations are best for new bakery

### First, cluster our districts using Kmeans

In [19]:
# Get our dummified categories.
venue_dummified = pd.get_dummies(location_venues[['Venue Category']], prefix="", prefix_sep="")

# Add our location back to dataframe.
venue_dummified['Location'] = location_venues['Location'] 

# Move location column to the beginning.
# Thanks to https://stackoverflow.com/a/56479671 😅
venue_dummified = venue_dummified[ ['Location'] + [ col for col in venue_dummified.columns if col != 'Location' ] ]

In [20]:
# Review our dataframe.
print('Shape:', venue_dummified.shape)
venue_dummified.head()

Shape: (320, 130)


Unnamed: 0,District,Accessories Store,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Arts & Crafts Store,Athletics & Sports,...,Tanning Salon,Tennis Court,Theater,Thrift / Vintage Store,Tourist Information Center,Toy / Game Store,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Yoga Studio
0,0-0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0-1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0-2,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0-3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0-4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# Calculate our mean venue categories per location.
venue_groups = venue_dummified.groupby('Location').mean().reset_index()
print('Shape:',venue_groups.shape)
venue_groups.head()

Shape: (58, 130)


Unnamed: 0,District,Accessories Store,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Arts & Crafts Store,Athletics & Sports,...,Tanning Salon,Tennis Court,Theater,Thrift / Vintage Store,Tourist Information Center,Toy / Game Store,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Yoga Studio
0,0-0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0-13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0-14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
num_top_venues = 10

# Create columns according to number of top venues.
indicators = ['st', 'nd', 'rd']
columns = ['Location']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# Create a new empty dataframe with our new columns and add in our locations.
location_venues_sorted = pd.DataFrame(columns=columns)
location_venues_sorted['Location'] = venue_groups['Location']

# Cycle over location groups...
for index, row in venue_groups.iterrows():
    # And add in num_top_venues of the top venue categories to each location.
    location_venues_sorted.iloc[index, 1:] = row.iloc[1:].sort_values(ascending=False).index.values[0:num_top_venues]

location_venues_sorted.head()

Unnamed: 0,District,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,0-0,Lake,Yoga Studio,Food Truck,Deli / Bodega,Department Store,Dessert Shop,Diner,Discount Store,Dog Run,Donut Shop
1,0-1,Sports Club,Yoga Studio,Donut Shop,Flower Shop,Fish Market,Fast Food Restaurant,Farmers Market,Electronics Store,Dog Run,Food Truck
2,0-10,Electronics Store,Yoga Studio,Home Service,Cosmetics Shop,Deli / Bodega,Department Store,Dessert Shop,Diner,Discount Store,Dog Run
3,0-13,Donut Shop,Yoga Studio,Home Service,Cosmetics Shop,Deli / Bodega,Department Store,Dessert Shop,Diner,Discount Store,Dog Run
4,0-14,Convenience Store,Pet Store,Yoga Studio,Deli / Bodega,Department Store,Dessert Shop,Diner,Discount Store,Dog Run,Donut Shop


In [23]:
# Our number of clusters.
kclusters = 10

# Calculate our KMeans.
location_groups_clustering = venue_groups.drop('Location', 1)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(location_groups_clustering)

In [24]:
# Add our clustering labels to our dataframe.
location_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

In [25]:
# Start preparing our final dataframe.
location_df_final = gainesville_locations.copy()

# Merge in our location clustering results.
location_df_final = location_df_final.join(location_venues_sorted.set_index('Location'), on='Location')

# If any location didn't have venues or ended with NaN scores, let's drop it.
district_df_final = location_df_final.dropna()

# Make sure the cluster labels are in int for our calculations.
location_df_final['Cluster Labels'] = location_df_final['Cluster Labels'].astype('int32')

location_df_final.head()

Unnamed: 0,District,Lat,Long,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,0-0,29.70617,-82.447966,3,Lake,Yoga Studio,Food Truck,Deli / Bodega,Department Store,Dessert Shop,Diner,Discount Store,Dog Run,Donut Shop
1,0-1,29.70617,-82.435976,3,Sports Club,Yoga Studio,Donut Shop,Flower Shop,Fish Market,Fast Food Restaurant,Farmers Market,Electronics Store,Dog Run,Food Truck
2,0-2,29.70617,-82.423986,3,Airport Terminal,Yoga Studio,Food Truck,Deli / Bodega,Department Store,Dessert Shop,Diner,Discount Store,Dog Run,Donut Shop
3,0-3,29.70617,-82.411996,7,Construction & Landscaping,Home Service,Cosmetics Shop,Deli / Bodega,Department Store,Dessert Shop,Diner,Discount Store,Dog Run,Donut Shop
4,0-4,29.70617,-82.400005,7,Construction & Landscaping,State / Provincial Park,Tennis Court,Yoga Studio,Donut Shop,Fish Market,Fast Food Restaurant,Farmers Market,Electronics Store,Dog Run
5,0-5,29.70617,-82.388015,3,Pizza Place,Pharmacy,Business Service,Sushi Restaurant,Seafood Restaurant,Dog Run,Skate Park,Bank,Chinese Restaurant,Sandwich Place
6,0-6,29.70617,-82.376025,6,Accessories Store,Home Service,Cosmetics Shop,Deli / Bodega,Department Store,Dessert Shop,Diner,Discount Store,Dog Run,Donut Shop
7,0-7,29.70617,-82.364035,3,Breakfast Spot,Yoga Studio,Convenience Store,Deli / Bodega,Department Store,Dessert Shop,Diner,Discount Store,Dog Run,Donut Shop
8,0-8,29.70617,-82.352045,3,Big Box Store,Bakery,Construction & Landscaping,Park,Moving Target,Farmers Market,Sandwich Place,Liquor Store,Convenience Store,Golf Course
9,0-9,29.70617,-82.340055,3,Breakfast Spot,Intersection,Sandwich Place,American Restaurant,Yoga Studio,Donut Shop,Fish Market,Fast Food Restaurant,Farmers Market,Electronics Store


In [26]:
# Create our map.
map_clusters = folium.Map(location=[GAINESVILLE_LATITUDE, GAINESVILLE_LONGITUDE], zoom_start=10)

# Set up different colors for each cluster
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Add each location as a marker on the map.
markers_colors = []
for lat, lon, poi, cluster in zip(location_df_final['Lat'], location_df_final['Long'], location_df_final['Location'], location_df_final['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Next, determine which clusters contain the most bakeries

We'll use this to determine which type of location best works for a bakery.

In [27]:
# Now, get the locations with a bakery in it.
bakery_locations = location_venues[location_venues['Venue Category'] == 'Bakery']
print('Total locations with bakeries: {}'.format(len(bakery_locations.groupby('Location').count())))
bakery_locations.head()

Total districts with bakeries: 0


Unnamed: 0,District,Lat,Long,Venue,Venue Latitude,Venue Longitude,Venue Category


In [28]:
# Determine which cluster has most bakeries in it.
bakery_locations.merge(location_df_final)['Cluster Labels'].value_counts()

Series([], Name: Cluster Labels, dtype: int64)

### Finally, determine which locations in that cluster do not have any bakery

In [29]:
# Get all locations with at least one restaurant.
locations_with_restaurants = location_df_final[location_df_final['Location'].isin(location_restaurants['Location'])]
print(locations_with_restaurants.shape)
locations_with_restaurants.head()

(0, 14)


Unnamed: 0,District,Lat,Long,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue


In [30]:
# Get all locations with at least one restaurant within cluster 1 (the one with most bakeries).
cluster_one_restaurant_locations = locations_with_restaurants[locations_with_restaurants['Cluster Labels'] == 1]
print(cluster_one_restaurant_locations.shape)
cluster_one_restaurant_locations.head()

(0, 14)


Unnamed: 0,District,Lat,Long,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue


In [31]:
# Get all locations with at least one restaurant within cluster 1 that do not have a bakery in it.
potential_locations = cluster_one_restaurant_locations[False == cluster_one_restaurant_locations['Location'].isin(bakery_locations['Location'])]
print(potential_locations.shape)
potential_locations.head()

(0, 14)


Unnamed: 0,District,Lat,Long,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue


In [32]:
# Create map of Gainesville to see our districts.
general_map = folium.Map(location=[GAINESVILLE_LATITUDE, GAINESVILLE_LONGITUDE], zoom_start=12)

# Add markers to the map for each districts.
for index, row in potential_locations.iterrows():
    folium.CircleMarker(
        [row['Lat'], row['Long']],
        radius=5,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(general_map)  
    
general_map