# IBM Data Science Professional Certificate Capstone

This notebook is for the neighborhood analyzation project for the data science capstone course on Coursera.

## Introduction
XX

# Part 1 - Identifying our districts

In [1]:
# Our needed imports.
!conda install -c conda-forge folium --yes
!conda install -c conda-forge geopy --yes
import folium
import ibm_boto3
import json
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
import types
from botocore.client import Config
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
from IPython.display import Image 
from sklearn.cluster import KMeans

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    ca-certificates-2020.4.5.2 |       hecda079_0         147 KB  conda-forge
    openssl-1.1.1g             |       h516909a_0         2.1 MB  conda-forge
    certifi-2020.4.5.2         |   py36h9f0ad1d_0         152 KB  conda-forge
    folium-0.11.0              |             py_0          61 KB  conda-forge
    branca-0.4.1               |             py_0          26 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following NEW packages will be INSTALLED:

    branca:          0.4.1-py_0        conda-forge
    folium:          

In [2]:
# Create our corners of Gainesville.
gainesville_north = 29.711381
gainesville_south = 29.596737
gainesville_west = -82.453961
gainesville_east = -82.262119

In [3]:
# Define how many rows and columns we want to create for districts.
DISTRICT_ROWS = 11
DISTRICT_COLUMNS = 16
GAINESVILLE_LATITUDE = 29.662737
GAINESVILLE_LONGITUDE = -82.370212

In [4]:
# Calculate how big each segment is.
lat_diff = gainesville_north - gainesville_south
long_diff = gainesville_west - gainesville_east
lat_segment = lat_diff / (DISTRICT_ROWS)
long_segment = long_diff / (DISTRICT_COLUMNS)

In [5]:
# Generate the center for all segments.
gainesville_districts = pd.DataFrame(columns=['District', 'Lat', 'Long'])
north_boundary = gainesville_north
for row in range(DISTRICT_ROWS):
    south_boundary = north_boundary - lat_segment
    row_center = (north_boundary + south_boundary) / 2
    west_boundary = gainesville_west
    for column in range(DISTRICT_COLUMNS):
        east_boundary = west_boundary - long_segment
        column_center = (east_boundary + west_boundary) / 2
        west_boundary = east_boundary
        gainesville_districts = gainesville_districts.append(pd.Series(['{}-{}'.format(row, column), row_center, column_center], index=gainesville_districts.columns), ignore_index=True)
    north_boundary = south_boundary
gainesville_districts.head()

Unnamed: 0,District,Lat,Long
0,0-0,29.70617,-82.447966
1,0-1,29.70617,-82.435976
2,0-2,29.70617,-82.423986
3,0-3,29.70617,-82.411996
4,0-4,29.70617,-82.400005


In [6]:
# Create map of Gainesville to see our districts.
general_map = folium.Map(location=[GAINESVILLE_LATITUDE, GAINESVILLE_LONGITUDE], zoom_start=12)

# Add markers to the map for each districts.
for index, row in gainesville_districts.iterrows():
    folium.CircleMarker(
        [row['Lat'], row['Long']],
        radius=17,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(general_map)  
    
general_map

# Part 2 - Getting businesses for each district

In [7]:
# The code was removed by Watson Studio for sharing.

In [9]:
# Prepares our venue DataFrame.
neighborhood_venues = pd.DataFrame(columns=[
                            'District',
                            'District Latitude',
                            'District Longitude', 
                            'Venue', 
                            'Venue Latitude', 
                            'Venue Longitude', 
                            'Venue Category'])

In [10]:
# Function for getting all venues in an area
def get_venues(lat, long, radius, limit):
    # create the API request URL.
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
        CLIENT_ID, 
        CLIENT_SECRET, 
        VERSION,
        lat, 
        long, 
        radius, 
        limit)
    
    # Load our results.
    r = requests.get(url)
    results = r.json()
    
    # Get the venues.
    try:
        venues = results["response"]['groups'][0]['items']
        return venues
    except KeyError:
        print('Trouble finding venues for {}. Returned response was:'.format(row['District']), results["response"])
    return []
        

In [11]:
district_count = len(gainesville_districts)
for index, row in gainesville_districts.iterrows():
    if index % 10 == 0:
        print('District {} of {}...'.format(index, district_count))
    venues = get_venues(row['Lat'], row['Long'], 500, 100)
    # Add each venue to our DataFrame.
    for venue in venues:
        neighborhood_venues = neighborhood_venues.append(pd.Series([
            row['District'],
            row['Lat'],
            row['Long'],
            venue['venue']['name'],
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']], index=neighborhood_venues.columns), ignore_index=True)

District 0 of 176...
District 10 of 176...
District 20 of 176...
District 30 of 176...
District 40 of 176...
District 50 of 176...
District 60 of 176...
District 70 of 176...
District 80 of 176...
District 90 of 176...
District 100 of 176...
District 110 of 176...
District 120 of 176...
District 130 of 176...
District 140 of 176...
District 150 of 176...
District 160 of 176...
District 170 of 176...


In [12]:
# Quick preview of our venues.
print(neighborhood_venues.shape)
neighborhood_venues.head()

(348, 7)


Unnamed: 0,District,District Latitude,District Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,0-5,29.70617,-82.388015,China Bowl,29.702655,-82.390303,Chinese Restaurant
1,0-5,29.70617,-82.388015,Cedar River Seafood,29.701723,-82.387995,Seafood Restaurant
2,0-5,29.70617,-82.388015,SUBWAY,29.702775,-82.390566,Sandwich Place
3,0-5,29.70617,-82.388015,Flowers Bakery,29.702798,-82.387151,Bakery
4,0-5,29.70617,-82.388015,Volcanic Sushi + Sake,29.702961,-82.390351,Sushi Restaurant


## Part 3 - Get just restaurants for each district

In [8]:
# Sets up our categories
food_category = '4d4b7105d754a06374d81259'

In [None]:
# Prepares our venue DataFrame.
neighborhood_restaurants = pd.DataFrame(columns=[
                            'District',
                            'District Latitude',
                            'District Longitude', 
                            'Venue', 
                            'Venue Latitude', 
                            'Venue Longitude', 
                            'Venue Category'])

In [None]:
# Function for getting all venues in an area
def get_restaurants(lat, long, radius, limit):
    # create the API request URL.
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&categoryId={}&ll={},{}&radius={}&limit={}'.format(
        CLIENT_ID, 
        CLIENT_SECRET, 
        VERSION,
        food_category,
        lat, 
        long, 
        radius, 
        limit)
    
    # Load our results.
    r = requests.get(url)
    results = r.json()
    
    # Get the venues.
    try:
        venues = results["response"]['groups'][0]['items']
        return venues
    except KeyError:
        print('Trouble finding venues for {}. Returned response was:'.format(row['District']), results["response"])
    return []
        

In [None]:
district_count = len(gainesville_districts)
for index, row in gainesville_districts.iterrows():
    if index % 10 == 0:
        print('District {} of {}...'.format(index, district_count))
    venues = get_restaurants(row['Lat'], row['Long'], 500, 100)
    # Add each venue to our DataFrame.
    for venue in venues:
        neighborhood_restaurants = neighborhood_venues.append(pd.Series([
            row['District'],
            row['Lat'],
            row['Long'],
            venue['venue']['name'],
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']], index=neighborhood_restaurants.columns), ignore_index=True)

In [None]:
# Quick preview of our venues.
print(neighborhood_restaurants.shape)
neighborhood_restaurants.head()

In [26]:
# Let's see how many districts have at least one restaurant.
print('Total districts with at least one restaurant: {}'.format(len(neighborhood_restaurants.groupby('District').count())))

Total districts with at least one restaurant: 78


In [25]:
# Now, get the districts with a bakery in it.
bakery_districts = neighborhood_restaurants[neighborhood_restaurants['Venue Category'] == 'Bakery']
len(bakery_districts)

13

# Part 4 - Determining which districts are best for new bakery

In [27]:
bakery_districts

Unnamed: 0,District,District Latitude,District Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
3,0-5,29.70617,-82.388015,Flowers Bakery,29.702798,-82.387151,Bakery
8,0-8,29.70617,-82.352045,Walmart Bakery,29.70644,-82.35684,Bakery
29,2-11,29.685326,-82.316075,Sunbeam Bakery-Wholesale,29.687512,-82.319959,Bakery
30,2-13,29.685326,-82.292094,Country Hearth Bakery,29.681937,-82.291785,Bakery
38,3-5,29.674903,-82.388015,Uppercrust,29.674301,-82.387022,Bakery
85,3-12,29.674903,-82.304084,Beehive Bakery,29.6738,-82.302096,Bakery
124,5-3,29.654059,-82.411996,Cinnabon,29.656574,-82.411222,Bakery
133,5-3,29.654059,-82.411996,Mrs Field's Cookies,29.657259,-82.411636,Bakery
192,5-10,29.654059,-82.328065,Cookiegazm,29.651825,-82.32674,Bakery
289,8-6,29.622792,-82.376025,Panera Bread,29.625161,-82.373782,Bakery
