# Capstone Project
To view a GitHub-hosted .ipynb file, open https://nbviewer.jupyter.org and paste the notebook's GitHub link into it.

## Code



In [1]:
# Importing all necessary libraries from the "Segmenting and Clustering Neighborhoods in New York City" lab
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Importing the Web Scraping libraries
# Install these into your system/environment if not downloaded
import requests
import lxml.html as lh

def get_content(url, timeout=30):
    """Download a web page and parse it into an lxml HTML document.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up, so that a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    lxml.html.HtmlElement
        Root element of the parsed page.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status. Failing here is clearer
        than parsing an error page and failing later on missing table rows.
    """
    page = requests.get(url, timeout=timeout)
    page.raise_for_status()
    # Parse the raw bytes so lxml can use the encoding declared by the page
    return lh.fromstring(page.content)

def get_coord(zip_code):
    """Look up the latitude and longitude of a ZIP code on zip-codes.com.

    Parameters
    ----------
    zip_code : str or int
        ZIP code to look up. It is converted with ``str()`` before being
        placed in the URL, so integer ZIP codes are accepted as well.

    Returns
    -------
    tuple of (float, float)
        ``(latitude, longitude)`` parsed from the page.

    Notes
    -----
    The coordinates are read from fixed positions in the page's ``<tr>``
    list (rows 11 and 12) and a fixed number of leading characters is
    stripped from each row's text. NOTE(review): this presumably skips the
    "Latitude:" / "Longitude:" labels -- confirm against the live page, since
    any layout change there will raise IndexError or ValueError here.
    """
    # Plain string concatenation below would raise TypeError for an int
    zip_code = str(zip_code)
    url = 'https://www.zip-codes.com/m/zip-code/' + zip_code + '/zip-code-' + zip_code + '.asp'
    zip_rows = get_content(url).xpath('//tr')
    lat = float(zip_rows[11].text_content()[9:])
    long = float(zip_rows[12].text_content()[10:])
    return (lat, long)

# URL of the zip-codes.com page listing the ZIP codes of Austin, TX
url = 'https://www.zip-codes.com/m/city/tx-austin.asp'
doc = get_content(url)

# Every <tr>..</tr> element on the page; the first one is used as the header row
tr_elements = doc.xpath('//tr')

# One (column name, list of values) pair per cell of the header row
col = [(header_cell.text_content(), []) for header_cell in tr_elements[0]]

# Fill the column lists from the rows that follow the header
for row in tr_elements[1:]:
    # A row that does not have exactly 4 cells is treated as the end of the table
    if len(row) != 4:
        break

    for i, cell in enumerate(row.iterchildren()):
        data = cell.text_content()
        # The first column is kept as text; cells in the other columns are
        # converted to int when they hold a whole number.
        if i > 0:
            try:
                data = int(data)
            except ValueError:
                # Not a whole number: keep the original text
                pass
        col[i][1].append(data)

Dict = {title: column for (title, column) in col}
df = pd.DataFrame(Dict)

# Keep only the ZIP code column, under a shorter name
df = df[['ZIP Code']]
df.columns = ['zipCode']

# Look up coordinates for every ZIP code (one HTTP request per row) and attach
# them as two new columns, aligned explicitly on df's index.
coords = df['zipCode'].apply(get_coord).tolist()
df[['Latitude', 'Longitude']] = pd.DataFrame(coords, index=df.index)
df.head()


Unnamed: 0,zipCode,Latitude,Longitude
0,73301,30.219702,-97.74726
1,73344,30.2669,-97.7429
2,78701,30.2672,-97.742306
3,78702,30.263915,-97.71366
4,78703,30.28973,-97.766479


In [2]:
import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

In [3]:
# Geocode the city itself; the result is used as the centre of the map below
address = 'Austin, TX'

geolocator = Nominatim(user_agent="explorer")
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; fail with a clear message
# instead of an AttributeError on the attribute accesses below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Austin are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Austin are 30.2711286, -97.7436995.


In [5]:
# Base map centred on the geocoded coordinates of Austin
map_austin = folium.Map(location=[latitude, longitude], zoom_start=10)

# Draw one circle marker per row of df; the popup text is the row's ZIP code
for zip_lat, zip_lng, zip_code in zip(df['Latitude'], df['Longitude'], df['zipCode']):
    popup = folium.Popup('{}'.format(zip_code), parse_html=True)
    marker = folium.CircleMarker(
        [zip_lat, zip_lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7)
    marker.add_to(map_austin)

# Last expression of the cell: render the map
map_austin

In [6]:
import os

# Foursquare credentials are read from the environment so that secrets are not
# stored in the notebook source or echoed into its saved output.
# Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before starting the kernel.
try:
    CLIENT_ID = os.environ['FOURSQUARE_CLIENT_ID'] # your Foursquare ID
    CLIENT_SECRET = os.environ['FOURSQUARE_CLIENT_SECRET'] # your Foursquare Secret
except KeyError as missing:
    raise RuntimeError(
        'Missing environment variable {}; set FOURSQUARE_CLIENT_ID and '
        'FOURSQUARE_CLIENT_SECRET before running this cell.'.format(missing)
    ) from missing
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
# Never print the secret itself: cell output is saved with the notebook
print('CLIENT_SECRET:' + '********')

Your credentails:
CLIENT_ID: UPX3DQWNOVURCU4JGBGCMOQU023OTZJIYE3QWLKF53XUFML4
CLIENT_SECRET:JWPBHGHKGM1CI3WFWDFHOQYVG0WZI2ZSHCM4GT5QZU1YMLA4


In [7]:
# Take the first row of df as a worked example: its ZIP code and coordinates
postal_name, postal_latitude, postal_longitude = (
    df.loc[0, field] for field in ('zipCode', 'Latitude', 'Longitude')
)

summary = 'Latitude and longitude values of {} are {}, {}.'
print(summary.format(postal_name, postal_latitude, postal_longitude))

# Search settings for the example request
LIMIT = 30
radius = 804.672  # meters; .5 miles

# Foursquare "explore" endpoint with placeholders for the credentials, API
# version, centre point (ll=lat,lng), search radius and result limit
explore_template = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'
url = explore_template.format(
    CLIENT_ID, CLIENT_SECRET, VERSION,
    postal_latitude, postal_longitude,
    radius, LIMIT)

Latitude and longitude values of 73301 are 30.219702, -97.74726.


In [8]:
def get_category_type(row):
    """Return the name of the first category attached to a venue row.

    Parameters
    ----------
    row : mapping
        Anything indexable by key (e.g. a dict or a pandas Series) that holds
        a list of category dicts under ``'categories'`` or, if that key is
        absent, under ``'venue.categories'``.

    Returns
    -------
    str or None
        The ``'name'`` of the first category, or None when the category list
        is empty or None.

    Raises
    ------
    KeyError
        If neither key is present in ``row``.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other error propagates
        categories_list = row['venue.categories']

    if categories_list is None or len(categories_list) == 0:
        return None
    return categories_list[0]['name']

In [9]:
# Fetch the example request; fail fast on a stalled connection or an HTTP error
# status instead of continuing with an error payload.
response = requests.get(url, timeout=30)
response.raise_for_status()
results = response.json()

In [10]:
# The venue records sit under response -> groups[0] -> items in the payload
venues = results['response']['groups'][0]['items']

# Flatten the nested JSON, then keep only name, categories and coordinates
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = json_normalize(venues).loc[:, filtered_columns]

# Reduce each row's category list to a single category name
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# Drop the dotted prefixes, e.g. 'venue.location.lat' -> 'lat'
nearby_venues.columns = [label.split(".")[-1] for label in nearby_venues.columns]

venue_count = nearby_venues.shape[0]
print('{} venues were returned by Foursquare.'.format(venue_count))

26 venues were returned by Foursquare.


In [11]:
def getNearbyVenues(names, latitudes, longitudes, radius=804.672, limit = 50):
    """Query the Foursquare "explore" endpoint around each given location.

    Parameters
    ----------
    names, latitudes, longitudes : iterable
        Parallel sequences; one request is made per (name, lat, lng) triple.
    radius : float, optional
        Search radius passed to the API (the URL elsewhere in this notebook
        labels this value as meters).
    limit : int, optional
        Maximum number of venues requested per location.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns PostalCode, Latitude, Longitude,
        Venue, Venue Latitude, Venue Longitude and Venue Category. The frame
        has these seven columns even when no venues were found.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status.

    Notes
    -----
    Reads the module-level CLIENT_ID, CLIENT_SECRET and VERSION, and prints
    each name as a progress indicator.
    """
    columns = ['PostalCode',
               'Latitude',
               'Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            limit)

        # make the GET request; surface HTTP errors instead of a later KeyError
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            categories = venue['categories']
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue may have no categories; record None instead of failing
                categories[0]['name'] if categories else None))

    # Passing the columns up front keeps an empty result well-formed
    return pd.DataFrame(rows, columns=columns)

In [13]:
# Collect venues around every ZIP code in df, using the default radius and limit
austinV = getNearbyVenues(df['zipCode'], df['Latitude'], df['Longitude'])

73301
73344
78701
78702
78703
78704
78705
78708
78709
78710
78711
78712
78713
78714
78715
78716
78717
78718
78719
78720
78721
78722
78723
78724
78725
78726
78727
78728
78729
78730
78731
78732
78733
78734
78735
78736
78737
78738
78739
78741
78742
78744
78745
78746
78747
78748
78749
78750
78751
78752
78753
78754
78755
78756
78757
78758
78759
78760
78761
78762
78763
78764
78765
78766
78767
78768
78772
78773
78774
78778
78779
78783
78799


In [14]:
# Sanity check: (rows, columns) of the venue table collected across all ZIP codes
print(austinV.shape)

(2048, 7)


In [15]:
# Count the non-null entries of every column per ZIP code, i.e. how many venues each ZIP code returned
austinV.groupby('PostalCode').count()

Unnamed: 0_level_0,Latitude,Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
73301,26,26,26,26,26,26
73344,50,50,50,50,50,50
78701,50,50,50,50,50,50
78702,47,47,47,47,47,47
78703,8,8,8,8,8,8
78704,25,25,25,25,25,25
78705,50,50,50,50,50,50
78708,50,50,50,50,50,50
78709,50,50,50,50,50,50
78710,9,9,9,9,9,9
