# Preparations

In [None]:
#First, lets import all the libraries to be used on this notebook
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files
import os # library to read credentials from environment variables

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

# Data acquisition and preparation

In [None]:
# Download the Wikipedia list of Minnesota cities and report whether the request succeeded.
url_mn = "https://en.wikipedia.org/wiki/List_of_cities_in_Minnesota"
page = requests.get(url_mn)
print('Page download successful'
      if page.status_code == 200
      else 'Page download error. Error code: {}'.format(page.status_code))

In [None]:
# Parse the first HTML table found at url_mn into a dataframe.
# NOTE: pd.read_html is given the URL, so it fetches the page itself; the `page`
# response downloaded in the previous cell is not used here.
df_mn = pd.read_html(url_mn)[0]
df_mn.head()

In [None]:
# Download the Minnesota safety ranking page and report whether the request succeeded.
url_mn_safety = "https://backgroundchecks.org/safest-cities-in-minnesota.html"
page = requests.get(url_mn_safety)
print('Page download successful'
      if page.status_code == 200
      else 'Page download error. Error code: {}'.format(page.status_code))

In [None]:
# Parse the first HTML table found at url_mn_safety into a dataframe.
# NOTE: pd.read_html is given the URL, so it fetches the page itself; the `page`
# response downloaded in the previous cell is not used here.
df_mnsafety = pd.read_html(url_mn_safety)[0]
df_mnsafety.head()

In [None]:
# Join the city list with the safety table on their shared 'City' column
# (default inner join: only cities present in both tables are kept).
df_merge = df_mn.merge(df_mnsafety, on='City')
df_merge.head()

In [None]:
df_merge.shape

In [None]:
# Keep only the cities whose 'Safety Index' is strictly positive.
df_safe = df_merge.loc[df_merge['Safety Index'].gt(0)]
df_safe.head()

In [None]:
# Remove the ranking / population columns that are not needed for the rest of the analysis.
df_safe = df_safe.drop(
    ['2019 Rank', '2019 Estimate[4]', '2010 Census[5]', 'Rank', 'Change'],
    axis=1,
)
df_safe.head()

In [None]:
# Label every remaining Minnesota city with its state and country, then drop the county column.
df_safe = (
    df_safe
    .assign(State='Minnesota', Country='United States')
    .drop(columns=['County'])
)
df_safe.head()

In [None]:
# Tianjin has only six central districts, so the dataframe is written out by hand.
# The districts are stored under 'City' so the schema matches the Minnesota data.
tj = pd.DataFrame({
    'City': ['Hedong', 'Heping', 'Hexi', 'Hebei', 'Nankai', 'Hongqiao'],
    'State': 'Tianjin',
    'Country': 'China',
})
tj.head()

In [None]:
# Stack the Tianjin districts on top of the Minnesota cities and renumber the rows 0..n-1.
# Columns that exist only in df_safe (if any) are filled with NaN for the Tianjin rows.
df_all = pd.concat([tj, df_safe],ignore_index=True)
df_all

In [None]:
# Geocode every city with Nominatim and keep only the ones that could be located.
# Rows are collected in a plain list and turned into a dataframe once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the frame on every call.
geolocator = Nominatim(user_agent="address_explorer")  # one client is enough for all lookups

geocoded_rows = []
for city, state, country in zip(df_all['City'], df_all['State'], df_all['Country']):
    location = geolocator.geocode("{}, {}, {}".format(city, state, country))

    # ignore the cities that are unable to be located by Nominatim
    if location is None:
        print("The location data of {} in {} is not available!".format(city,state))
        continue

    geocoded_rows.append({'City': city,
                          'State': state,
                          'Country': country,
                          'Latitude': location.latitude,
                          'Longitude': location.longitude})

# Passing `columns` keeps the expected schema even when nothing could be geocoded.
all_cities = pd.DataFrame(geocoded_rows,
                          columns=['City', 'State', 'Country', 'Latitude', 'Longitude'])

In [None]:
all_cities.head()

In [None]:
# Split by the 'State' label rather than by row position: cities that could not be
# geocoded are missing from all_cities, so "the first six rows" is not guaranteed
# to be exactly the six Tianjin districts.
is_tianjin = all_cities['State'] == 'Tianjin'
tj_cities = all_cities[is_tianjin]
mn_cities = all_cities[~is_tianjin]

In [None]:
# Look up the coordinates of Minnesota with geopy; they are used to centre the maps below.
addressmn = 'Minnesota, United States'

geolocator = Nominatim(user_agent="mn_explorer")
locationmn = geolocator.geocode(addressmn)
latitudemn, longitudemn = locationmn.latitude, locationmn.longitude

mn_message = 'The geograpical coordinate of MN is {}, {}.'
print(mn_message.format(latitudemn, longitudemn))

In [None]:
# Look up the coordinates of Tianjin with geopy; they are used to centre the maps below.
addresstj = 'Tianjin, China'

geolocator = Nominatim(user_agent="tj_explorer")
locationtj = geolocator.geocode(addresstj)
latitudetj, longitudetj = locationtj.latitude, locationtj.longitude

tj_message = 'The geograpical coordinate of TJ is {}, {}.'
print(tj_message.format(latitudetj, longitudetj))

In [None]:
# Base map centred on the Minnesota coordinates looked up earlier.
map_mn_cities = folium.Map(location=[latitudemn, longitudemn], zoom_start=10)

# Appearance shared by every Minnesota marker.
mn_marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False,
)

# One circle marker per Minnesota city; the popup shows the city name.
for city_lat, city_lng, city_name in zip(mn_cities['Latitude'], mn_cities['Longitude'], mn_cities['City']):
    folium.CircleMarker(
        [city_lat, city_lng],
        popup=folium.Popup('{}'.format(city_name), parse_html=True),
        **mn_marker_style).add_to(map_mn_cities)

map_mn_cities

In [None]:
# Base map centred on the Tianjin coordinates looked up earlier.
map_tj_cities = folium.Map(location=[latitudetj, longitudetj], zoom_start=10)

# Appearance shared by every Tianjin marker.
tj_marker_style = dict(
    radius=5,
    color='green',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False,
)

# One circle marker per Tianjin district; the popup shows the district name.
for city_lat, city_lng, city_name in zip(tj_cities['Latitude'], tj_cities['Longitude'], tj_cities['City']):
    folium.CircleMarker(
        [city_lat, city_lng],
        popup=folium.Popup('{}'.format(city_name), parse_html=True),
        **tj_marker_style).add_to(map_tj_cities)

map_tj_cities


In [None]:
#Define Foursquare Credentials and Version
# Credentials are read from the environment so they never have to be typed into
# (and saved with) the notebook. Both fall back to '' when the variable is unset.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
# Never echo the secret itself: cell outputs are stored in the notebook file.
print('CLIENT_SECRET: ' + ('<set>' if CLIENT_SECRET else '<not set>'))

In [None]:
LIMIT = 1000 # limit of number of venues returned by Foursquare API

def getNearbyVenues(names, latitudes, longitudes, radius=1000):
    """Query the Foursquare "explore" endpoint around each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Label and coordinates of every location to explore. They are walked
        in parallel with ``zip``.
    radius : int, default 1000
        Value sent as the ``radius`` query parameter of the request.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'City', 'City Latitude',
        'City Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude' and
        'Venue Category'. The frame has these columns even when no venue was
        returned at all. 'Venue Category' is None for venues that carry no
        category.

    Raises
    ------
    requests.HTTPError
        If Foursquare answers with an error status (for example invalid
        credentials or an exhausted quota).
    """
    venue_columns = ['City',
                     'City Latitude',
                     'City Longitude',
                     'Venue',
                     'Venue Latitude',
                     'Venue Longitude',
                     'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and escape the query string instead of formatting it by hand.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30)
        # Fail with the HTTP error rather than an opaque KeyError on the payload.
        response.raise_for_status()

        groups = response.json().get('response', {}).get('groups') or []
        items = groups[0].get('items', []) if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue may come back without any category
                categories[0]['name'] if categories else None))

    # Naming the columns here (not afterwards) also works when venue_rows is empty.
    return pd.DataFrame(venue_rows, columns=venue_columns)

In [None]:
# Fetch the nearby venues of every geocoded city (Tianjin districts and Minnesota
# cities alike) and collect them in a single dataframe called all_venues
all_venues = getNearbyVenues(names=all_cities['City'],
                                   latitudes=all_cities['Latitude'],
                                   longitudes=all_cities['Longitude']
                                  )



In [None]:
all_venues.head()

In [None]:
all_venues.shape

In [None]:
# Report how many distinct venue categories occur across all returned venues
# (a missing category, if present, is counted as one value).
print('There are {} uniques categories.'.format(
    all_venues['Venue Category'].nunique(dropna=False)))

# Data analysis

In [None]:
# one hot encoding: one indicator column per venue category
allvenues_onehot = pd.get_dummies(all_venues[['Venue Category']], prefix="", prefix_sep="")

# A venue category that is itself called 'City' would collide with the city-name
# column added below (and be silently overwritten), so move it out of the way first.
if 'City' in allvenues_onehot.columns:
    allvenues_onehot = allvenues_onehot.rename(columns={'City': 'City (venue category)'})

# add the city name as the first column
allvenues_onehot.insert(0, 'City', all_venues['City'])

allvenues_onehot.head()

In [None]:
allvenues_onehot.shape

In [None]:
# data cleaning
def clean_columns(columns,newcolumnname,dataframe):
    """Merge several category columns into a single summed column.

    Parameters
    ----------
    columns : list of str
        Names of the columns to fold together. Names that are not present in
        ``dataframe`` are skipped, because the set of venue categories depends
        on what the API returned for this run.
    newcolumnname : str
        Name of the merged column. If a column with this name already exists
        its values are added to the merged column instead of being discarded.
    dataframe : pandas.DataFrame
        Frame holding the columns; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame without the merged source columns and with
        ``newcolumnname`` appended as the last column. When none of the
        requested columns exist the new column is all zeros.
    """
    to_merge = [column for column in columns if column in dataframe.columns]
    # Fold a pre-existing target column into the sum rather than overwriting it.
    if newcolumnname in dataframe.columns and newcolumnname not in to_merge:
        to_merge.append(newcolumnname)

    value = 0
    for column in to_merge:
        value = dataframe[column] + value

    # drop() returns a new frame, so the caller's dataframe is left untouched
    dataframe = dataframe.drop(to_merge,axis = 1)
    dataframe[newcolumnname] = value

    return dataframe

# Closely related venue categories are folded into one column each.
# Every entry is (name of the merged column, categories folded into it);
# the merges are applied in the order listed.
category_merges = [
    ('Art Museum', ['Art Gallery', 'Arts & Crafts Store', 'Museum']),
    ('Baseball Field', ['Baseball Field', 'Baseball Stadium']),
    ('Stadium', ['Soccer Field', 'Soccer Stadium', 'Stadium']),
    ('Bar', ['Bar', 'Beer Garden', 'Cocktail Bar', 'Whisky Bar', 'Wine Bar', 'Hotel Bar']),
    ('Liquor', ['Wine Shop', 'Liquor Store']),
    ('Cafeteria', ['Café', 'Coffee Shop', 'Gaming Cafe']),
    ('Street Food', ['Food', 'Food & Drink Shop', 'Food Service', 'Food Truck']),
    ('Shopping Mall', ['Shopping Mall', 'Shopping Plaza']),
    ('Sports', ['Sporting Goods Shop', 'Sports Bar']),
    ('Gym', ['Gym', 'Gym / Fitness Center', 'Gym Pool', 'Gymnastics Gym']),
    ('Japanese Restaurant', ['Japanese Restaurant', 'Ramen Restaurant', 'Sushi Restaurant']),
    ('Metro Station', ['Metro Station', 'Light Rail Station']),
    ('Pet Service', ['Pet Service', 'Pet Store', 'Animal Shelter']),
    ('Chinese Restaurant', ['Chinese Restaurant', 'Hotpot Restaurant',
                            'Dim Sum Restaurant', 'Dumpling Restaurant', 'Taiwanese Restaurant']),
]

for newcolumnname, columns in category_merges:
    allvenues_onehot = clean_columns(columns, newcolumnname, allvenues_onehot)

#allvenues_onehot.columns.values

In [None]:
allvenues_onehot.head()

In [None]:
# Average the indicator columns per city: each value becomes the share of that
# city's venues that fall into the category.
all_grouped = allvenues_onehot.groupby('City', as_index=False).mean()
all_grouped

In [None]:
#Let's print each city along with the top 5 most common venues
num_top_venues = 5

for hood in all_grouped['City']:
    print("----"+hood+"----")
    # Category frequencies of this city: every column after the leading 'City' column.
    freqs = all_grouped.loc[all_grouped['City'] == hood].iloc[0, 1:]
    # Build the small table directly instead of assigning into a slice of a
    # transposed frame, which raised SettingWithCopyWarning.
    temp = pd.DataFrame({'venue': freqs.index,
                         'freq': freqs.astype(float).round(2).values})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

In [None]:
#Let's put that into a pandas dataframe
#First, let's write a function to sort the venues in descending order.
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the ``num_top_venues`` largest entries of ``row``.

    The first entry of ``row`` (the city name) is skipped; the remaining
    entries are ranked from highest to lowest value and the labels of the
    leading ones are returned as an array.
    """
    ranked_labels = row.iloc[1:].sort_values(ascending=False).index.values
    return ranked_labels[:num_top_venues]

In [None]:
#Now let's create the new dataframe and display the top 20 venues for each city.
num_top_venues = 20

# create columns according to number of top venues
columns = ['City']
for rank in range(1, num_top_venues + 1):
    # English ordinals: 11th-13th (and 111th, ...) take 'th'; otherwise the last
    # digit decides (1st, 2nd, 3rd, 21st, ...). This replaces a bare `except`
    # that doubled as control flow and produced '21th'.
    if 11 <= rank % 100 <= 13:
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(rank % 10, 'th')
    columns.append('{}{} Most Common Venue'.format(rank, suffix))

# create a new dataframe
cities_venues_sorted = pd.DataFrame(columns=columns)
cities_venues_sorted['City'] = all_grouped['City']

# fill every row with that city's categories, most frequent first
for ind in range(all_grouped.shape[0]):
    cities_venues_sorted.iloc[ind, 1:] = return_most_common_venues(all_grouped.iloc[ind, :], num_top_venues)

cities_venues_sorted

In [None]:
#Run k-means to cluster the cities into 5 clusters.
# set number of clusters
kclusters = 5

# Drop the label column by keyword: the positional form drop('City', 1) was
# removed in pandas 2.0.
all_grouped_clustering = all_grouped.drop(columns='City')

# run k-means clustering
# n_init is pinned so the result does not depend on the scikit-learn version
# (the default changed from 10 to 'auto').
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(all_grouped_clustering)

# check cluster labels generated for the first rows in the dataframe
kmeans.labels_[0:20]


In [None]:
#Let's create a new dataframe that includes the cluster as well as the top 20 venues for each city.
# add clustering labels
cities_venues_sorted['Cluster label']=kmeans.labels_

# Merge the per-city venue ranking into the geocoded cities to add latitude/longitude.
# An inner join keeps only the cities that were actually clustered: a city for
# which no venues were returned has no cluster label, and a left join would leave
# NaN there (turning the labels into floats and breaking the maps further down).
all_merged = all_cities.join(cities_venues_sorted.set_index('City'), on='City', how='inner')
all_merged['Cluster label'] = all_merged['Cluster label'].astype(int)

all_merged.head(6) # check the last columns!


In [None]:
# Report the size of every cluster; iterate over kclusters so the loop stays
# correct when the number of clusters is changed.
for ii in range(kclusters):
    num = all_merged.loc[all_merged['Cluster label'] == ii].shape[0]
    print('Total number of neighborhoods in cluster {} is {}'.format(ii, num))

In [None]:
#cluster 0
all_merged.loc[all_merged['Cluster label'] == 0]

In [None]:
#cluster 1
all_merged.loc[all_merged['Cluster label'] == 1]

In [None]:
#cluster 2
all_merged.loc[all_merged['Cluster label'] == 2]

In [None]:
#cluster 3
all_merged.loc[all_merged['Cluster label'] == 3]

In [None]:
#cluster 4
all_merged.loc[all_merged['Cluster label'] == 4]

In [None]:
#Finally, let's visualize the resulting clusters (map centred on Minnesota)
map_clusters = folium.Map(location=[latitudemn, longitudemn], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(all_merged['Latitude'], all_merged['Longitude'], all_merged['City'], all_merged['Cluster label']):
    # a city without a cluster label cannot be coloured, so it is left off the map
    if pd.isna(cluster):
        continue
    cluster = int(cluster)  # np.int was removed in NumPy 1.24; the builtin is equivalent
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [None]:
#The same cluster map, this time centred on Tianjin
map_clusters = folium.Map(location=[latitudetj, longitudetj], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(all_merged['Latitude'], all_merged['Longitude'], all_merged['City'], all_merged['Cluster label']):
    # a city without a cluster label cannot be coloured, so it is left off the map
    if pd.isna(cluster):
        continue
    cluster = int(cluster)  # np.int was removed in NumPy 1.24; the builtin is equivalent
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

## Report link

#### https://xiaojunfu.blogspot.com/2020/06/where-to-live-after-retirement.html