In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import geocoder
import folium
from geopy.geocoders import Nominatim # convert an address to langtitude and longitude
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
# import k-means from clustering stage
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors
import math

In [2]:
url = "https://vi.wikipedia.org/wiki/Th%C3%A0nh_ph%E1%BB%91_H%E1%BB%93_Ch%C3%AD_Minh"

r  = requests.get(url)

data = r.text

soup = BeautifulSoup(data,'lxml')

In [3]:
df = []
for i in soup.find_all("table",class_="wikitable sortable")[0].find_all("tr"):
    ls = []
    for j in i.find_all("td"):
        ls.append(j.get_text())
    df.append(ls)
df = pd.DataFrame(df)

In [4]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
0,,,,,,,,,,,...,,,,,,,,,,
1,\n\n\nTên\n\nDiện tích (km²)\nDân số (người)\n...,Quận (19)\n,Quận 1\n,772,142.0,10 phường\n,Quận 2\n,4979.0,180.0,11 phường\n,...,"1 thị trấn, 20 xã\n",Hóc Môn\n,10917.0,542.0,"1 thị trấn, 11 xã\n",Nhà Bè\n,10043.0,206.0,"1 thị trấn, 6 xã\n",\n
2,,,,,,,,,,,...,,,,,,,,,,
3,Quận (19)\n,,,,,,,,,,...,,,,,,,,,,
4,Quận 1\n,772,142.000,10 phường\n,,,,,,,...,,,,,,,,,,


In [5]:
df.drop(labels=[0,1,2,3,16,24],inplace=True)
df = df.iloc[:,0:3]
df.columns=['District Name','Acreage (Km2)','Population']
df.reset_index(drop=True,inplace=True)
# Remove EOL "\n" String in dataset
for i in df.columns:
    df[i] = df[i].str.replace("\n","",regex=True)
# Add an district_id columns to verify district
df['District_id'] = df.index
# Change columns order ['District_id','District Name','Acreage (Km2)','Population']
df = df[['District_id','District Name','Acreage (Km2)','Population']]

In [6]:
df['Acreage (Km2)'] = df['Acreage (Km2)'].str.replace(",",".",regex=True)
df['Population'] = df['Population'].str.replace(".","",regex=True)
df['Acreage (Km2)'].astype(float)
df['Population'].astype(int)

0     142000
1     180000
2     190000
3     175000
4     159000
5     233000
6     360000
7     424000
8     397000
9     234000
10    209000
11    620000
12    784000
13    499000
14    676000
15    163000
16    474000
17    485000
18    592000
19    705000
20     71000
21    462000
22    542000
23    206000
Name: Population, dtype: int32

In [7]:
df.head()

Unnamed: 0,District_id,District Name,Acreage (Km2),Population
0,0,Quận 1,7.72,142000
1,1,Quận 2,49.79,180000
2,2,Quận 3,4.92,190000
3,3,Quận 4,4.18,175000
4,4,Quận 5,4.27,159000


In [8]:
df.shape

(24, 4)

In [9]:
df_hcmc = df.copy()

### Prepare for fourquare dataframe

In [10]:
df_hcm_cor = pd.read_csv("HCM district coordinates.csv")

In [11]:
df_hcm_cor.head()

Unnamed: 0,District Name,Latitude,Longitude
0,Quận 1,10.777369,106.696646
1,Quận 2,10.782377,106.754713
2,Quận 3,10.780681,106.680866
3,Quận 4,10.758388,106.702021
4,Quận 5,10.755418,106.667333


In [12]:
df_hcm_cor = df_hcm_cor.sort_values(by="District Name",ascending=True)
df_hcmc = df_hcmc.sort_values(by="District Name",ascending=True)

In [13]:
df_hcm_cor['District Name'] == df_hcmc['District Name']

19    True
13    True
12    True
20    True
21    True
14    True
22    True
23    True
15    True
0     True
9     True
10    True
11    True
1     True
2     True
3     True
4     True
5     True
6     True
7     True
8     True
18    True
16    True
17    True
Name: District Name, dtype: bool

In [14]:
df_hcmc = df_hcmc.merge(df_hcm_cor.iloc[:,1:3],left_index=True,right_index=True)

In [15]:
df_hcmc

Unnamed: 0,District_id,District Name,Acreage (Km2),Population,Latitude,Longitude
19,19,Bình Chánh,252.56,705000,10.724583,106.575197
13,13,Bình Thạnh,20.78,499000,10.812639,106.714579
12,12,Bình Tân,52.02,784000,10.770324,106.599978
20,20,Cần Giờ,704.45,71000,10.535622,106.854503
21,21,Củ Chi,434.77,462000,11.008502,106.518123
14,14,Gò Vấp,19.73,676000,10.837427,106.666492
22,22,Hóc Môn,109.17,542000,10.891521,106.600013
23,23,Nhà Bè,100.43,206000,10.650182,106.729357
15,15,Phú Nhuận,4.88,163000,10.799939,106.677777
0,0,Quận 1,7.72,142000,10.777369,106.696646


In [16]:
address = 'Ho Chi Minh'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Ho Chi Minh City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Ho Chi Minh City are 10.7758439, 106.7017555.


In [17]:
m = folium.Map(location=[latitude, longitude],zoom_start=12)
#url = 'https://cocl.us/sanfran_geojson'
#geo_data_input = f'{url}'
for index,values in df_hcmc.iterrows():
    folium.CircleMarker(
        [values['Latitude'], values['Longitude']],
        radius=2,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(m)  
m

In [18]:
CLIENT_ID = 'TYNGSEF0SNXTWZU5JEPMP3Z3BSCBA2TQEK2ODCGA114NG4GG' # your Foursquare ID
CLIENT_SECRET = 'G21UECLTFUK3MLIFP4SQK03OR1JMMDI4EPQS4J12TF0CFMP3' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: TYNGSEF0SNXTWZU5JEPMP3Z3BSCBA2TQEK2ODCGA114NG4GG
CLIENT_SECRET:G21UECLTFUK3MLIFP4SQK03OR1JMMDI4EPQS4J12TF0CFMP3


In [19]:
# type your answer here
def get_foursquare_url(cl_id,cl_se,ver,lat,long,radius,category):
    url = 'https://api.foursquare.com/v2/venues/search?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&categoryId={}&intent=browse&limit=10000'.format(
        CLIENT_ID, 
        CLIENT_SECRET, 
        VERSION, 
        lat, 
        long, 
        radius,
        category,
        )
    return url

In [20]:
def getNearbyVenues(district_id,names,population, latitudes, longitudes, acreage,category_type,category):
    CLIENT_ID = 'TYNGSEF0SNXTWZU5JEPMP3Z3BSCBA2TQEK2ODCGA114NG4GG' # your Foursquare ID
    CLIENT_SECRET = 'G21UECLTFUK3MLIFP4SQK03OR1JMMDI4EPQS4J12TF0CFMP3' # your Foursquare Secret
    VERSION = 20200304
    venues_list=[]
    for dis_id,name,pop, lat, lng,acr in zip(district_id,names,population, latitudes, longitudes, acreage):
        print(name)
            
        # create the API request URL
        url = get_foursquare_url(CLIENT_ID,CLIENT_SECRET,VERSION,lat,lng,int(math.sqrt(float(acr)/float(3.14))*1000),category)
            
        # make the GET request
        results = requests.get(url).json()['response']['venues']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            dis_id,
            name,
            pop,
            acr,
            lat, 
            lng, 
            v['id'],
            v['name'],
            v['location']['lat'], 
            v['location']['lng'],
            category_type
        ) for v in results])
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['District Id','District Name',
                  'District Population',
                  'District Acreage (Km2)',
                  'District Latitude', 
                  'District Longitude',
                  'Venue Id',
                  'Venue Name',
                  'Venue Latitude', 
                  'Venue Longitude',
                  'Venue Category'
                ]
    
    return(nearby_venues)

In [21]:
coffee_category = '4bf58dd8d48988d1e0931735,5665c7b9498e7d8a4f2c0f06,5e18993feee47d000759b256,4bf58dd8d48988d1a1941735,4bf58dd8d48988d128941735,4bf58dd8d48988d16d941735,54135bf5e4b08f3d2429dfe7,56aa371be4b08b9a8d573508,54f4ba06498e2cf5561da814,4bf58dd8d48988d18d941735,4bf58dd8d48988d1f0941735'
coffee_venues = getNearbyVenues(district_id = df_hcmc['District_id'],names=df_hcmc['District Name'],population = df_hcmc['Population'],latitudes = df_hcmc['Latitude'],longitudes = df_hcmc['Longitude'],acreage = df_hcmc['Acreage (Km2)'],category_type="Coffee Shop",category=coffee_category)

Bình Chánh
Bình Thạnh
Bình Tân
Cần Giờ
Củ Chi
Gò Vấp
Hóc Môn
Nhà Bè
Phú Nhuận
Quận 1
Quận 10
Quận 11
Quận 12
Quận 2
Quận 3
Quận 4
Quận 5
Quận 6
Quận 7
Quận 8
Quận 9
Thủ Đức
Tân Bình
Tân Phú


In [22]:
coffee_venues

Unnamed: 0,District Id,District Name,District Population,District Acreage (Km2),District Latitude,District Longitude,Venue Id,Venue Name,Venue Latitude,Venue Longitude,Venue Category
0,19,Bình Chánh,705000,252.56,10.724583,106.575197,5e5240407f39440008eba736,Cao Coffee,10.787477,106.623610,Coffee Shop
1,19,Bình Chánh,705000,252.56,10.724583,106.575197,5b6d5a08121384002c083f89,passio,10.749037,106.653464,Coffee Shop
2,19,Bình Chánh,705000,252.56,10.724583,106.575197,52f79dff11d2ba07c640abd2,Miền Thảo Mộc Café,10.763035,106.645140,Coffee Shop
3,19,Bình Chánh,705000,252.56,10.724583,106.575197,5a916d52588e361c5d177eef,Jardin De Cafe,10.713166,106.611015,Coffee Shop
4,19,Bình Chánh,705000,252.56,10.724583,106.575197,5889f35692789f233348cc4f,The Coffee House (178 Hậu Giang),10.749859,106.643630,Coffee Shop
...,...,...,...,...,...,...,...,...,...,...,...
1035,17,Tân Phú,485000,15.97,10.790275,106.628605,5674bcb1498eafd410f67b78,Effoc Au Co,10.780876,106.643685,Coffee Shop
1036,17,Tân Phú,485000,15.97,10.790275,106.628605,58b8fd4bdebdf608b204e29d,CityGARDENCafe,10.797878,106.641813,Coffee Shop
1037,17,Tân Phú,485000,15.97,10.790275,106.628605,5137604ae4b0cff30ae71c58,Cafe Kem Phu Cuong - Chung Cu Bau Cat 2,10.785528,106.644642,Coffee Shop
1038,17,Tân Phú,485000,15.97,10.790275,106.628605,5323b20c11d2013f95595850,Cafe Cóc,10.795633,106.631691,Coffee Shop


### Explain


---
    We will seperate all venues to 2 category:
        1. Other food services, Entertainment services: Restaurant, Park,Zoo,Cinema... Provide short term customer around these areas
        2. State public services,Industrial park, manufacturing area, employment area, School, hospital, medical services, House resident... Provide long term customer around these areas

---

In [23]:
# define and  get long term services from foursquare category
Long_term_services_category = '4bf58dd8d48988d126941735,4bf58dd8d48988d172941735,4d4b7105d754a06379d81259,4bf58dd8d48988d130941735,56aa371be4b08b9a8d573517,4bf58dd8d48988d1ff931735,56aa371be4b08b9a8d5734d7,5744ccdfe4b0c0459246b4d6,4d4b7105d754a06372d81259,4bf58dd8d48988d12f941735,4bf58dd8d48988d104941735,4bf58dd8d48988d13b941735,52e81612bcbc57f1066b7a36,4e67e38e036454776db1fb3a'
Short_term_services_category = '4d4b7105d754a06374d81259,4bf58dd8d48988d17f941735,4bf58dd8d48988d181941735,4bf58dd8d48988d1e5931735,4bf58dd8d48988d137941735,4bf58dd8d48988d184941735,4bf58dd8d48988d193941735,4bf58dd8d48988d17b941735,4d4b7105d754a06376d81259,4d4b7105d754a06377d81259,4bf58dd8d48988d131941735,4d4b7105d754a06378d81259'

In [24]:
# get long term services from foursquare
long_term_df = getNearbyVenues(district_id = df_hcmc['District_id'],names=df_hcmc['District Name'],population = df_hcmc['Population'],latitudes = df_hcmc['Latitude'],longitudes = df_hcmc['Longitude'],acreage = df_hcmc['Acreage (Km2)'],category_type = "Long Term Servicecs",category = Long_term_services_category)

Bình Chánh
Bình Thạnh
Bình Tân
Cần Giờ
Củ Chi
Gò Vấp
Hóc Môn
Nhà Bè
Phú Nhuận
Quận 1
Quận 10
Quận 11
Quận 12
Quận 2
Quận 3
Quận 4
Quận 5
Quận 6
Quận 7
Quận 8
Quận 9
Thủ Đức
Tân Bình
Tân Phú


In [25]:
long_term_df.head()

Unnamed: 0,District Id,District Name,District Population,District Acreage (Km2),District Latitude,District Longitude,Venue Id,Venue Name,Venue Latitude,Venue Longitude,Venue Category
0,19,Bình Chánh,705000,252.56,10.724583,106.575197,4bda3c362a3a0f47bd78aab6,Ben Xe Mien Tay,10.740855,106.619255,Long Term Servicecs
1,19,Bình Chánh,705000,252.56,10.724583,106.575197,5be7ae22cbcdee002c21d406,The Pegasuite,10.734,106.654137,Long Term Servicecs
2,19,Bình Chánh,705000,252.56,10.724583,106.575197,5cbc201461e53b002c29304c,RichStar Residence Zone 2,10.775657,106.621075,Long Term Servicecs
3,19,Bình Chánh,705000,252.56,10.724583,106.575197,50bc001ee4b0f4ad3dadd7c8,Bưu điện Bình Hưng,10.721716,106.655857,Long Term Servicecs
4,19,Bình Chánh,705000,252.56,10.724583,106.575197,4ff18120e4b05e4a2f44846d,Phòng khám nhi khoa Bác Sỹ Tuấn,10.758918,106.640842,Long Term Servicecs


In [26]:
short_term_df = getNearbyVenues(district_id = df_hcmc['District_id'],names=df_hcmc['District Name'],population = df_hcmc['Population'],latitudes = df_hcmc['Latitude'],longitudes = df_hcmc['Longitude'],acreage = df_hcmc['Acreage (Km2)'],category_type = "Short Term Servicecs",category = Short_term_services_category)

Bình Chánh
Bình Thạnh
Bình Tân
Cần Giờ
Củ Chi
Gò Vấp
Hóc Môn
Nhà Bè
Phú Nhuận
Quận 1
Quận 10
Quận 11
Quận 12
Quận 2
Quận 3
Quận 4
Quận 5
Quận 6
Quận 7
Quận 8
Quận 9
Thủ Đức
Tân Bình
Tân Phú


In [27]:
short_term_df.head()

Unnamed: 0,District Id,District Name,District Population,District Acreage (Km2),District Latitude,District Longitude,Venue Id,Venue Name,Venue Latitude,Venue Longitude,Venue Category
0,19,Bình Chánh,705000,252.56,10.724583,106.575197,5e5240407f39440008eba736,Cao Coffee,10.787477,106.62361,Short Term Servicecs
1,19,Bình Chánh,705000,252.56,10.724583,106.575197,4c5e7c5085a1e21e316e5c11,Chợ Bình Tây,10.749798,106.650974,Short Term Servicecs
2,19,Bình Chánh,705000,252.56,10.724583,106.575197,576836e8498ea0ca13a33063,AEON Mall Bình Tân,10.742904,106.611836,Short Term Servicecs
3,19,Bình Chánh,705000,252.56,10.724583,106.575197,5d7dac2d07420c00084ef387,Gym Center @ Richstar Residence 2,10.772143,106.624157,Short Term Servicecs
4,19,Bình Chánh,705000,252.56,10.724583,106.575197,4c70ab359375a093c4f80737,Co.opMart Phu Lam,10.754266,106.633904,Short Term Servicecs


In [28]:
# Let merge 3 dataframes into 1 dataframe
coffee_venues = coffee_venues.append(long_term_df,ignore_index=True)
coffee_venues = coffee_venues.append(short_term_df,ignore_index=True)

In [29]:
coffee_venues.shape

(3150, 11)

In [30]:
coffee_venues['Venue Category'].value_counts()

Long Term Servicecs     1061
Short Term Servicecs    1049
Coffee Shop             1040
Name: Venue Category, dtype: int64

In [31]:
category_df = coffee_venues.copy()

In [32]:
# export as csv file
category_df.to_csv("df_hcm.csv")

### Analyze

In [33]:
category_df

Unnamed: 0,District Id,District Name,District Population,District Acreage (Km2),District Latitude,District Longitude,Venue Id,Venue Name,Venue Latitude,Venue Longitude,Venue Category
0,19,Bình Chánh,705000,252.56,10.724583,106.575197,5e5240407f39440008eba736,Cao Coffee,10.787477,106.623610,Coffee Shop
1,19,Bình Chánh,705000,252.56,10.724583,106.575197,5b6d5a08121384002c083f89,passio,10.749037,106.653464,Coffee Shop
2,19,Bình Chánh,705000,252.56,10.724583,106.575197,52f79dff11d2ba07c640abd2,Miền Thảo Mộc Café,10.763035,106.645140,Coffee Shop
3,19,Bình Chánh,705000,252.56,10.724583,106.575197,5a916d52588e361c5d177eef,Jardin De Cafe,10.713166,106.611015,Coffee Shop
4,19,Bình Chánh,705000,252.56,10.724583,106.575197,5889f35692789f233348cc4f,The Coffee House (178 Hậu Giang),10.749859,106.643630,Coffee Shop
...,...,...,...,...,...,...,...,...,...,...,...
3145,17,Tân Phú,485000,15.97,10.790275,106.628605,538aa07e498ef93b52f9ac7c,Chợ Bà Hoa,10.790870,106.647187,Short Term Servicecs
3146,17,Tân Phú,485000,15.97,10.790275,106.628605,591d40429b047319cb3f01cb,The Coffee House,10.792879,106.628855,Short Term Servicecs
3147,17,Tân Phú,485000,15.97,10.790275,106.628605,5d1cb037b55cb800236a34b5,Kohnan Japan,10.801722,106.617333,Short Term Servicecs
3148,17,Tân Phú,485000,15.97,10.790275,106.628605,5dd68ad1634ada0008511628,Chè 62 Cây Keo,10.771338,106.629880,Short Term Servicecs


In [34]:
(category_df[category_df['District Id']==1])['Venue Category'].value_counts()

Short Term Servicecs    50
Long Term Servicecs     31
Coffee Shop             20
Name: Venue Category, dtype: int64

In [35]:
columns_name = ['District Id','District Name','District Population','District Acreage (Km2)','District Latitude','District Longitude','Num_Of_Coffee','Num_Of_ShortTerm','Num_Of_LongTerm']
new_df = pd.DataFrame(data=None,columns=columns_name)
for i in category_df['District Id'].unique():
    tm_1 = category_df[category_df['District Id']==i]
    tm = (category_df[category_df['District Id']==i])['Venue Category'].value_counts()
    #print(type(pd.DataFrame(columns = columns_name,data = [[tm_1.iloc[0,0],tm_1.iloc[0,1],tm_1.iloc[0,2],tm_1.iloc[0,3],tm_1.iloc[0,4],tm_1.iloc[0,5],tm['Coffee Shop'],tm['Short Term Servicecs'],tm['Long Term Servicecs']]])))
    new_df = new_df.append(pd.DataFrame(columns = columns_name,data = [[tm_1.iloc[0,0],tm_1.iloc[0,1],tm_1.iloc[0,2],tm_1.iloc[0,3],tm_1.iloc[0,4],tm_1.iloc[0,5],tm['Coffee Shop'],tm['Short Term Servicecs'],tm['Long Term Servicecs']]]),ignore_index=True)

In [36]:
new_df

Unnamed: 0,District Id,District Name,District Population,District Acreage (Km2),District Latitude,District Longitude,Num_Of_Coffee,Num_Of_ShortTerm,Num_Of_LongTerm
0,19,Bình Chánh,705000,252.56,10.724583,106.575197,50,17,50
1,13,Bình Thạnh,499000,20.78,10.812639,106.714579,18,50,18
2,12,Bình Tân,784000,52.02,10.770324,106.599978,50,50,50
3,20,Cần Giờ,71000,704.45,10.535622,106.854503,2,42,27
4,21,Củ Chi,462000,434.77,11.008502,106.518123,40,50,50
5,14,Gò Vấp,676000,19.73,10.837427,106.666492,50,16,24
6,22,Hóc Môn,542000,109.17,10.891521,106.600013,50,50,50
7,23,Nhà Bè,206000,100.43,10.650182,106.729357,22,37,50
8,15,Phú Nhuận,163000,4.88,10.799939,106.677777,50,50,50
9,0,Quận 1,142000,7.72,10.777369,106.696646,50,50,50
