# Clustering Postal Code areas in Haiti

_By **Grégory PINCHINAT**_

July 17, 2020

## 0. Packages

Import necessary packages

In [1]:
import pandas as pd # To manage dataframes
pd.set_option('display.max_rows', None) # Displays all rows of all dataframe tables
pd.set_option('display.max_columns', None) # Display all columns of all dataframe tables

import numpy as np # To manages matrices and vectors

import folium # For map data visualization

import requests # To handle API requests

from geopy.geocoders import Nominatim # To search for coordinates from addresses

import json # To handle json data (if needed)
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

from bs4 import BeautifulSoup # For webscraping

from sklearn.cluster import KMeans # For KMean Clustering

## I. Data Scraping

Download wikipedia page and check download status

In [2]:
# Download page to be scraped
page = requests.get('https://fr.wikipedia.org/wiki/Code_postal_en_Ha%C3%AFti')

# A status code starting with a 2 indicates success, whereas one starting with a 4 or 5 indicates failure
page.status_code

200

In [3]:
# Check results
page.content

b'\n<!DOCTYPE html>\n<html class="client-nojs" lang="fr" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>Code postal en Ha\xc3\xafti \xe2\x80\x94 Wikip\xc3\xa9dia</title>\n<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":[",\\t.","\xc2\xa0\\t,"],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","janvier","f\xc3\xa9vrier","mars","avril","mai","juin","juillet","ao\xc3\xbbt","septembre","octobre","novembre","d\xc3\xa9cembre"],"wgRequestId":"87fd8583-a2e7-4b0f-abcd-55001bf0c6a2","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Code_postal_en_Ha\xc3\xafti","wgTitle":"Code postal en Ha\xc3\xafti","wgCurRevisionId":172769008,"wgRevisionId":172769008,"wgArticleId":9791343,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Portail:G\xc3\xa9ographie/Articles li\xc3\xa9s","Portail:\xc3\x

In [4]:
# Create BeautifulSoup instance and render results

soup = BeautifulSoup(page.content, 'html.parser')
print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="fr">
 <head>
  <meta charset="utf-8"/>
  <title>
   Code postal en Haïti — Wikipédia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":[",\t."," \t,"],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","janvier","février","mars","avril","mai","juin","juillet","août","septembre","octobre","novembre","décembre"],"wgRequestId":"87fd8583-a2e7-4b0f-abcd-55001bf0c6a2","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Code_postal_en_Haïti","wgTitle":"Code postal en Haïti","wgCurRevisionId":172769008,"wgRevisionId":172769008,"wgArticleId":9791343,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Portail:Géographie/Articles liés","Portail:Écriture/Articles liés","Portail:Haïti/Articles liés","Portail:Caraïbe/Articles 

In [5]:
# Check elements of the instance

len(list(soup.children))

5

Find all boroughs (Departments) and put them in a list

In [6]:
# Find all elements of class 'mw-headline'
dept = soup.find_all(class_='mw-headline')

# Instantiate empty list
list_dept = []

# Get the text in the class element and append to the empty list above
for dep in dept:
    dep_name = dep.get_text()
    list_dept.append(dep_name)

# Oops! Some elements of the list are not department names, slice for department names only
list_dept = list_dept[3:-4]
list_dept

["Départment de l'Artibonite",
 'Département du Centre',
 'Département de la Grande Anse',
 'Département des Nippes',
 'Département du Nord',
 'Département du Nord-Est',
 'Département du Nord-Ouest',
 'Département de l’Ouest',
 'Département du Sud',
 'Département du Sud-Est']

The neighborhoods needed are embedded in `'ul'` classes, which are basically a frame under a department with a list of neighborhoods (`'li'` classes inside the `'ul'`) belonging to this respective department. We do the same as we did for the Departments

In [7]:
# Using the method 'select', get all classes 'li' of all classes 'ul'
code_dept = soup.select('ul li')

# Oops! Some elements of the list are not 'code + neighborhood' names, slice for these only
code_dept = code_dept[17:-57]
code_dept

# Instantiate empty list of 'code + neighborhoods', get the text from the elements and append them to the list
list_cd = []
for i in range(len(code_dept)):
    list_cd.append(code_dept[i].get_text())
    
list_cd

['HT4110\xa0: Gonaïves',
 'HT4111\xa0: Petite Rivière de l’Artibonite',
 'HT4120\xa0: Ennery',
 "HT4130\xa0: L'Estère",
 'HT4210\xa0: Gros Morne',
 'HT4220\xa0: Terre-Neuve',
 'HT4230\xa0: Anse-Rouge',
 'HT4231\xa0: Sources Chaudes',
 'HT4310\xa0: Saint-Marc',
 'HT4311\xa0: Montrouis',
 'HT4320\xa0: Verrettes',
 'HT4321\xa0: Désarmes',
 'HT4322\xa0: Deschapelles',
 'HT4323\xa0: Liancourt',
 'HT4410\xa0: Dessalines (Marchand-Dessalines)',
 'HT4420\xa0: Petite-Rivière-de-l’Artibonite',
 'HT4421\xa0: Savane à Roches',
 'HT4430\xa0: Grande-Saline',
 'HT4440\xa0: Desdunes',
 'HT4510\xa0: Marmelade',
 'HT4520\xa0: Saint-Michel-de-l’Attalaye',
 'HT5110\xa0: Hinche',
 'HT5111\xa0: Los Palis',
 'HT5120\xa0: Maïssade',
 'HT5121\xa0: Louverture',
 'HT5130\xa0: Thomonde',
 'HT5140\xa0: Cerca-Carvajal',
 'HT5210\xa0: Mirebalais',
 'HT5211\xa0: Dufailly',
 'HT5220\xa0: Saut-d’Eau',
 'HT5230\xa0: Boucan-Carré',
 'HT5231\xa0: Péligre',
 'HT5310\xa0: Lascahobas',
 'HT5320\xa0: Belladère',
 'HT5321\xa0:

Build dataset

In [8]:
# Make dataframe with the above list
df = pd.DataFrame({'Code_city': list_cd})
df.head()

Unnamed: 0,Code_city
0,HT4110 : Gonaïves
1,HT4111 : Petite Rivière de l’Artibonite
2,HT4120 : Ennery
3,HT4130 : L'Estère
4,HT4210 : Gros Morne


In [9]:
# Extract Postal Codes and add them to a separate column
df['Postal Code'] = df['Code_city'].str[:6]
df.head()

Unnamed: 0,Code_city,Postal Code
0,HT4110 : Gonaïves,HT4110
1,HT4111 : Petite Rivière de l’Artibonite,HT4111
2,HT4120 : Ennery,HT4120
3,HT4130 : L'Estère,HT4130
4,HT4210 : Gros Morne,HT4210


In [10]:
# Extract neighborhoods and add them to a separate column
df['Neighborhood'] = df['Code_city'].str[9:]
df.head()

Unnamed: 0,Code_city,Postal Code,Neighborhood
0,HT4110 : Gonaïves,HT4110,Gonaïves
1,HT4111 : Petite Rivière de l’Artibonite,HT4111,Petite Rivière de l’Artibonite
2,HT4120 : Ennery,HT4120,Ennery
3,HT4130 : L'Estère,HT4130,L'Estère
4,HT4210 : Gros Morne,HT4210,Gros Morne


In [11]:
# Drop the initial columns
df.drop(['Code_city'], axis=1, inplace=True)

In [12]:
df.head()

Unnamed: 0,Postal Code,Neighborhood
0,HT4110,Gonaïves
1,HT4111,Petite Rivière de l’Artibonite
2,HT4120,Ennery
3,HT4130,L'Estère
4,HT4210,Gros Morne


In [13]:
# Add Department column
df['Department'] =''

In [14]:
df.head()

Unnamed: 0,Postal Code,Neighborhood,Department
0,HT4110,Gonaïves,
1,HT4111,Petite Rivière de l’Artibonite,
2,HT4120,Ennery,
3,HT4130,L'Estère,
4,HT4210,Gros Morne,


In [15]:
df['Department'][df['Postal Code'].str[2]=='4'] = 'Artibonite'

In [16]:
df.head()

Unnamed: 0,Postal Code,Neighborhood,Department
0,HT4110,Gonaïves,Artibonite
1,HT4111,Petite Rivière de l’Artibonite,Artibonite
2,HT4120,Ennery,Artibonite
3,HT4130,L'Estère,Artibonite
4,HT4210,Gros Morne,Artibonite


In [17]:
df['Department'][df['Postal Code'].str[2]=='5'] = 'Centre'
df['Department'][df['Postal Code'].str[2]=='1'] = 'Nord'
df['Department'][df['Postal Code'].str[2]=='2'] = 'Nord-Est'
df['Department'][df['Postal Code'].str[2]=='3'] = 'Nord-Ouest'
df['Department'][df['Postal Code'].str[2]=='6'] = 'Ouest'
df['Department'][df['Postal Code'].str[2]=='8'] = 'Sud'
df['Department'][df['Postal Code'].str[2]=='9'] = 'Sud-Est'
df['Department'][(df['Postal Code'].str[2:4]=='74')|(df['Postal Code'].str[2:4]=='75')] = 'Nippes'

In [18]:
df['Department'][df['Department']==''] = "Grand'Anse"

In [19]:
# Reorder columns
new_col = [df.columns[0], df.columns[-1], df.columns[1]]
print(new_col)

['Postal Code', 'Department', 'Neighborhood']


In [20]:
df = df[new_col]
df.head()

Unnamed: 0,Postal Code,Department,Neighborhood
0,HT4110,Artibonite,Gonaïves
1,HT4111,Artibonite,Petite Rivière de l’Artibonite
2,HT4120,Artibonite,Ennery
3,HT4130,Artibonite,L'Estère
4,HT4210,Artibonite,Gros Morne


Search coordinates with Geopy

In [21]:
list(set(df.Department))

["Grand'Anse",
 'Nord-Ouest',
 'Nord',
 'Nippes',
 'Nord-Est',
 'Artibonite',
 'Sud-Est',
 'Centre',
 'Ouest',
 'Sud']

In [22]:
# Make a list of addresses
addresses = [neighborhood+' '+department+' Haiti' for (neighborhood, department) in zip(df.Neighborhood, df.Department)]
addresses

['Gonaïves Artibonite Haiti',
 'Petite Rivière de l’Artibonite Artibonite Haiti',
 'Ennery Artibonite Haiti',
 "L'Estère Artibonite Haiti",
 'Gros Morne Artibonite Haiti',
 'Terre-Neuve Artibonite Haiti',
 'Anse-Rouge Artibonite Haiti',
 'Sources Chaudes Artibonite Haiti',
 'Saint-Marc Artibonite Haiti',
 'Montrouis Artibonite Haiti',
 'Verrettes Artibonite Haiti',
 'Désarmes Artibonite Haiti',
 'Deschapelles Artibonite Haiti',
 'Liancourt Artibonite Haiti',
 'Dessalines (Marchand-Dessalines) Artibonite Haiti',
 'Petite-Rivière-de-l’Artibonite Artibonite Haiti',
 'Savane à Roches Artibonite Haiti',
 'Grande-Saline Artibonite Haiti',
 'Desdunes Artibonite Haiti',
 'Marmelade Artibonite Haiti',
 'Saint-Michel-de-l’Attalaye Artibonite Haiti',
 'Hinche Centre Haiti',
 'Los Palis Centre Haiti',
 'Maïssade Centre Haiti',
 'Louverture Centre Haiti',
 'Thomonde Centre Haiti',
 'Cerca-Carvajal Centre Haiti',
 'Mirebalais Centre Haiti',
 'Dufailly Centre Haiti',
 'Saut-d’Eau Centre Haiti',
 'Bou

In [23]:
len(addresses) == len(df)

True

In [24]:
# Add latitudes and longitudes columns
df['Nei_lat'] = ''
df['Nei_lng'] = ''

In [25]:
# Get geographical coordinates of Neighborhoods to complete the data
geolocator = Nominatim(user_agent="ht_explorer")

# Loop thru the list
for i in range(len(df)):
    # Adding two layers of try except to handle exception errors that may occur
    try:
        try: 
            location = geolocator.geocode(addresses[i])
            df['Nei_lat'][i] = location.latitude
            df['Nei_lng'][i] = location.longitude
        except: # In case some neighborhood name has a mistake and geopy doesn't recognize it, do this:
            location = geolocator.geocode(df['Neighborhood'][i]+' Haiti')
            df['Nei_lat'][i] = location.latitude
            df['Nei_lng'][i] = location.longitude
    except: # If geopy still can't return a location object, print which address gave the problem and continue looping
        print('Exception occured at address '+addresses[i]+'/ info returned: '+str(geolocator.geocode(addresses[i])))
        

Exception occured at address Lesson Grand'Anse Haiti/ info returned: None
Exception occured at address Bande-du-Nord (quartier du Cap) Nord Haiti/ info returned: None
Exception occured at address Labadie (quartier du Cap) Nord Haiti/ info returned: None
Exception occured at address Haut-du-Cap (quartier du Cap) Nord Haiti/ info returned: None
Exception occured at address Barbon Nord Haiti/ info returned: None
Exception occured at address Trou-duNord Nord-Est Haiti/ info returned: None
Exception occured at address Pointe des Oiseaux Nord-Ouest Haiti/ info returned: None
Exception occured at address ort-au-Prince Ouest Haiti/ info returned: None
Exception occured at address Bel-Air et Bourdon Ouest Haiti/ info returned: None
Exception occured at address Martissant et Fontamara Ouest Haiti/ info returned: None
Exception occured at address Maïs Gâté, Tabarre zone 1 Ouest Haiti/ info returned: None
Exception occured at address Petite Place Cazeau, Tabarre zone 2 Ouest Haiti/ info returned: 

## II. Data Wrangling

We had to compare many data providers to get the missing data. For example, the first exception is about the name "Lesson" which should be "Cesson" in reality, which we found the coordinates on OpenStreetMaps: `(18.5969136, -74.4008307)`. We can trust OpenStreetMaps because the National Statistical Bureau of Haiti (IHSI) has been working with them and using their infrastructure to continuously update national geographical data in Haiti.

In [26]:
# Replace Lesson by Cesson 
print(df['Neighborhood'][df.Neighborhood=='Lesson'])
df['Neighborhood'][df.Neighborhood=='Lesson'] = 'Cesson'

50    Lesson
Name: Neighborhood, dtype: object


In [27]:
# Check the change
print(df['Neighborhood'][df.Neighborhood=='Cesson'])

50    Cesson
Name: Neighborhood, dtype: object


In [28]:
# Insert the coordinates
df['Nei_lat'][df.Neighborhood=='Cesson'] = 18.5969136
df['Nei_lng'][df.Neighborhood=='Cesson'] = -74.4008307

In [29]:
# Check the changes
df[['Neighborhood','Nei_lat', 'Nei_lng']][49:52]

Unnamed: 0,Neighborhood,Nei_lat,Nei_lng
49,Dame-Marie,18.5613,-74.419
50,Cesson,18.5969,-74.4008
51,Les Irois,18.4037,-74.4523


In [30]:
df[(df.Nei_lat=='')|(df.Nei_lng=='')]

Unnamed: 0,Postal Code,Department,Neighborhood,Nei_lat,Nei_lng
68,HT1111,Nord,Bande-du-Nord (quartier du Cap),,
69,HT1112,Nord,Labadie (quartier du Cap),,
70,HT1113,Nord,Haut-du-Cap (quartier du Cap),,
81,HT1320,Nord,Barbon,,
108,HT2310,Nord-Est,Trou-duNord,,
123,HT3121,Nord-Ouest,Pointe des Oiseaux,,
135,HT6110,Ouest,ort-au-Prince,,
136,HT6111,Ouest,Bel-Air et Bourdon,,
144,HT6119,Ouest,Martissant et Fontamara,,
148,HT6123,Ouest,"Maïs Gâté, Tabarre zone 1",,


In [31]:
location68 = geolocator.geocode('Bande-du-Nord Haiti')
df['Nei_lat'].loc[68] = location68.latitude
df['Nei_lng'].loc[68] = location68.longitude
df[['Neighborhood','Nei_lat', 'Nei_lng']].loc[68]

Neighborhood    Bande-du-Nord (quartier du Cap)
Nei_lat                                 19.7669
Nei_lng                                -72.2081
Name: 68, dtype: object

In [32]:
location69 = geolocator.geocode('Labadie Haiti')
df['Nei_lat'].loc[69] = location69.latitude
df['Nei_lng'].loc[69] = location69.longitude
df[['Neighborhood','Nei_lat', 'Nei_lng']].loc[69]

Neighborhood    Labadie (quartier du Cap)
Nei_lat                           18.4021
Nei_lng                           -72.884
Name: 69, dtype: object

In [33]:
df['Nei_lat'].loc[70] = 19.723938
df['Nei_lng'].loc[70] = -72.223021
df[['Neighborhood','Nei_lat', 'Nei_lng']].loc[70]

Neighborhood    Haut-du-Cap (quartier du Cap)
Nei_lat                               19.7239
Nei_lng                               -72.223
Name: 70, dtype: object

In [34]:
# Replace Barbon by Balan 
print(df['Neighborhood'][df.Neighborhood=='Barbon'])
df['Neighborhood'][df.Neighborhood=='Barbon'] = 'Balan'

81    Barbon
Name: Neighborhood, dtype: object


In [35]:
# Check the change
print(df['Neighborhood'][df.Neighborhood=='Balan'])

81    Balan
Name: Neighborhood, dtype: object


In [36]:
# Insert the coordinates
df['Nei_lat'][df.Neighborhood=='Balan'] = 19.732037
df['Nei_lng'][df.Neighborhood=='Balan'] = -72.186627

In [37]:
# Check the changes
df[['Neighborhood','Nei_lat', 'Nei_lng']][80:83]

Unnamed: 0,Neighborhood,Nei_lat,Nei_lng
80,Grande-Rivière-du-Nord,19.5778,-72.1693
81,Balan,19.732,-72.1866
82,Saint-Raphaël,19.4393,-72.1994


In [38]:
# Replace Trou-duNord by Trou-du-Nord 
print(df['Neighborhood'][df.Neighborhood=='Trou-duNord'])
df['Neighborhood'][df.Neighborhood=='Trou-duNord'] = 'Trou-du-Nord'

108    Trou-duNord
Name: Neighborhood, dtype: object


In [39]:
location108 = geolocator.geocode('Trou-du-Nord Haiti')
df['Nei_lat'].loc[108] = location108.latitude
df['Nei_lng'].loc[108] = location108.longitude
df[['Neighborhood','Nei_lat', 'Nei_lng']].loc[108]

Neighborhood    Trou-du-Nord
Nei_lat              19.6168
Nei_lng             -72.0249
Name: 108, dtype: object

In [40]:
# Replace 'Pointe des Oiseaux' by 'Pointe aux Oiseaux (Ile de la Tortue)''
print(df['Neighborhood'][df.Neighborhood=='Pointe des Oiseaux'])
df['Neighborhood'][df.Neighborhood=='Pointe des Oiseaux'] = 'Pointe aux Oiseaux (Ile de la Tortue)'

123    Pointe des Oiseaux
Name: Neighborhood, dtype: object


In [41]:
location123 = geolocator.geocode('Pointe aux Oiseaux Haiti')
df['Nei_lat'].loc[123] = location123.latitude
df['Nei_lng'].loc[123] = location123.longitude
df[['Neighborhood','Nei_lat', 'Nei_lng']].loc[123]

Neighborhood    Pointe aux Oiseaux (Ile de la Tortue)
Nei_lat                                       20.0133
Nei_lng                                      -72.7593
Name: 123, dtype: object

In [42]:
# Replace 'ort-au-Prince' by 'Port-au-Prince'
print(df['Neighborhood'][df.Neighborhood=='ort-au-Prince'])
df['Neighborhood'][df.Neighborhood=='ort-au-Prince'] = 'Port-au-Prince'

135    ort-au-Prince
Name: Neighborhood, dtype: object


In [43]:
location135 = geolocator.geocode('Pointe aux Oiseaux Haiti')
df['Nei_lat'].loc[135] = location135.latitude
df['Nei_lng'].loc[135] = location135.longitude
df[['Neighborhood','Nei_lat', 'Nei_lng']].loc[135]

Neighborhood    Port-au-Prince
Nei_lat                20.0133
Nei_lng               -72.7593
Name: 135, dtype: object

Now we will split the table in order to explode the groups of multiple neighborhoods into individual lines with the same respective Postal Codes. We're doing this because we can't find geographical coordinates by postal codes but rather by Neighborhood name.

In [44]:
df.loc[136.5] = df.loc[136].copy()
df[135:138]

Unnamed: 0,Postal Code,Department,Neighborhood,Nei_lat,Nei_lng
135.0,HT6110,Ouest,Port-au-Prince,20.0133,-72.7593
136.0,HT6111,Ouest,Bel-Air et Bourdon,,
137.0,HT6112,Ouest,Bas Peu de Choses,18.5335,-72.3386
138.0,HT6113,Ouest,Turgeau,18.5338,-72.33


In [45]:
df[df.Neighborhood=='Bel-Air et Bourdon']

Unnamed: 0,Postal Code,Department,Neighborhood,Nei_lat,Nei_lng
136.0,HT6111,Ouest,Bel-Air et Bourdon,,
136.5,HT6111,Ouest,Bel-Air et Bourdon,,


In [46]:
df = df.sort_index().reset_index(drop=True)

In [47]:
df.head()

Unnamed: 0,Postal Code,Department,Neighborhood,Nei_lat,Nei_lng
0,HT4110,Artibonite,Gonaïves,19.4461,-72.6884
1,HT4111,Artibonite,Petite Rivière de l’Artibonite,19.123,-72.48
2,HT4120,Artibonite,Ennery,19.4846,-72.4849
3,HT4130,Artibonite,L'Estère,19.3032,-72.6102
4,HT4210,Artibonite,Gros Morne,19.6716,-72.6784


In [48]:
df[df.Neighborhood=='Bel-Air et Bourdon']

Unnamed: 0,Postal Code,Department,Neighborhood,Nei_lat,Nei_lng
136,HT6111,Ouest,Bel-Air et Bourdon,,
137,HT6111,Ouest,Bel-Air et Bourdon,,


In [49]:
df['Neighborhood'].loc[136] = 'Bel-Air'
df['Neighborhood'].loc[137] = 'Bourdon'

In [50]:
df[135:138]

Unnamed: 0,Postal Code,Department,Neighborhood,Nei_lat,Nei_lng
135,HT6110,Ouest,Port-au-Prince,20.0133,-72.7593
136,HT6111,Ouest,Bel-Air,,
137,HT6111,Ouest,Bourdon,,


In [51]:
location136 = geolocator.geocode('Bel-Air, Port-au-Prince')
df['Nei_lat'].loc[136] = location136.latitude
df['Nei_lng'].loc[136] = location136.longitude
df[['Neighborhood','Nei_lat', 'Nei_lng']].loc[136]

Neighborhood    Bel-Air
Nei_lat           18.55
Nei_lng        -72.3362
Name: 136, dtype: object

In [52]:
location137 = geolocator.geocode('Bourdon, Port-au-Prince')
df['Nei_lat'].loc[137] = location137.latitude
df['Nei_lng'].loc[137] = location137.longitude
df[['Neighborhood','Nei_lat', 'Nei_lng']].loc[137]

Neighborhood    Bourdon
Nei_lat         18.5387
Nei_lng        -72.3154
Name: 137, dtype: object

In [53]:
df[135:138]

Unnamed: 0,Postal Code,Department,Neighborhood,Nei_lat,Nei_lng
135,HT6110,Ouest,Port-au-Prince,20.0133,-72.7593
136,HT6111,Ouest,Bel-Air,18.55,-72.3362
137,HT6111,Ouest,Bourdon,18.5387,-72.3154


In [54]:
df[(df.Nei_lat=='')|(df.Nei_lng=='')]

Unnamed: 0,Postal Code,Department,Neighborhood,Nei_lat,Nei_lng
145,HT6119,Ouest,Martissant et Fontamara,,
149,HT6123,Ouest,"Maïs Gâté, Tabarre zone 1",,
150,HT6124,Ouest,"Petite Place Cazeau, Tabarre zone 2",,
155,HT6133,Ouest,"Côte-Page, Mahotière",,
156,HT6134,Ouest,"Arcachon, Warney",,
158,HT6136,Ouest,"Lamentin, Mariani, Merger",,
160,HT6141,Ouest,"Frères, pergnier,peguy-ville",,
166,HT6145,Ouest,Thomassin 32 route Aimé-Bastient imp Laurent,,
168,HT6147,Ouest,Pergnier,,
170,HT6160,Ouest,Gressie,,


In [55]:
df.loc[145.5] = df.loc[145].copy()
df = df.sort_index().reset_index(drop=True)

In [56]:
df[144:147]

Unnamed: 0,Postal Code,Department,Neighborhood,Nei_lat,Nei_lng
144,HT6118,Ouest,Bolosse,18.5304,-72.3495
145,HT6119,Ouest,Martissant et Fontamara,,
146,HT6119,Ouest,Martissant et Fontamara,,


In [57]:
df['Neighborhood'].loc[145] = 'Martissant'
df['Neighborhood'].loc[146] = 'Fontamara'

In [58]:
location145 = geolocator.geocode('Martissant')
df['Nei_lat'].loc[145] = location145.latitude
df['Nei_lng'].loc[145] = location145.longitude
df[['Neighborhood','Nei_lat', 'Nei_lng']].loc[145]

Neighborhood    Martissant
Nei_lat            18.5274
Nei_lng            -72.357
Name: 145, dtype: object

In [59]:
location146 = geolocator.geocode('Fontamara')
df['Nei_lat'].loc[146] = location146.latitude
df['Nei_lng'].loc[146] = location146.longitude
df[['Neighborhood','Nei_lat', 'Nei_lng']].loc[146]

Neighborhood    Fontamara
Nei_lat            18.527
Nei_lng          -72.3741
Name: 146, dtype: object

In [60]:
df[(df.Nei_lat=='')|(df.Nei_lng=='')]

Unnamed: 0,Postal Code,Department,Neighborhood,Nei_lat,Nei_lng
150,HT6123,Ouest,"Maïs Gâté, Tabarre zone 1",,
151,HT6124,Ouest,"Petite Place Cazeau, Tabarre zone 2",,
156,HT6133,Ouest,"Côte-Page, Mahotière",,
157,HT6134,Ouest,"Arcachon, Warney",,
159,HT6136,Ouest,"Lamentin, Mariani, Merger",,
161,HT6141,Ouest,"Frères, pergnier,peguy-ville",,
167,HT6145,Ouest,Thomassin 32 route Aimé-Bastient imp Laurent,,
169,HT6147,Ouest,Pergnier,,
171,HT6160,Ouest,Gressie,,
175,HT6220,Ouest,PetitGoâve,,


In [61]:
location150 = geolocator.geocode('Maïs Gâté, Tabarre')
df['Nei_lat'].loc[150] = location150.latitude
df['Nei_lng'].loc[150] = location150.longitude
df[['Neighborhood','Nei_lat', 'Nei_lng']].loc[150]

Neighborhood    Maïs Gâté, Tabarre zone 1
Nei_lat                           18.5552
Nei_lng                          -72.2611
Name: 150, dtype: object

In [62]:
location151 = geolocator.geocode('Petite Place Cazeau, Tabarre')
df['Nei_lat'].loc[151] = location151.latitude
df['Nei_lng'].loc[151] = location151.longitude
df[['Neighborhood','Nei_lat', 'Nei_lng']].loc[151]

Neighborhood    Petite Place Cazeau, Tabarre zone 2
Nei_lat                                     18.5463
Nei_lng                                    -72.2565
Name: 151, dtype: object

In [63]:
df.loc[156.5] = df.loc[156].copy()
df = df.sort_index().reset_index(drop=True)

In [64]:
df[155:158]

Unnamed: 0,Postal Code,Department,Neighborhood,Nei_lat,Nei_lng
155,HT6132,Ouest,"Diquini, Thor",18.5328,-72.3875
156,HT6133,Ouest,"Côte-Page, Mahotière",,
157,HT6133,Ouest,"Côte-Page, Mahotière",,


In [65]:
df['Neighborhood'].loc[156] = 'Côte-Plage'
df['Neighborhood'].loc[157] = 'Mahotière'

In [66]:
location156 = geolocator.geocode('Côte-Plage, Carrefour, Haiti')
df['Nei_lat'].loc[156] = location156.latitude
df['Nei_lng'].loc[156] = location156.longitude
df[['Neighborhood','Nei_lat', 'Nei_lng']].loc[156]

Neighborhood    Côte-Plage
Nei_lat            18.5376
Nei_lng           -72.3933
Name: 156, dtype: object

In [67]:
location157 = geolocator.geocode('Mahotière, Carrefour, Haiti')
df['Nei_lat'].loc[157] = location157.latitude
df['Nei_lng'].loc[157] = location157.longitude
df[['Neighborhood','Nei_lat', 'Nei_lng']].loc[157]

Neighborhood    Mahotière
Nei_lat           18.5283
Nei_lng          -72.3963
Name: 157, dtype: object

In [68]:
df[(df.Nei_lat=='')|(df.Nei_lng=='')]

Unnamed: 0,Postal Code,Department,Neighborhood,Nei_lat,Nei_lng
158,HT6134,Ouest,"Arcachon, Warney",,
160,HT6136,Ouest,"Lamentin, Mariani, Merger",,
162,HT6141,Ouest,"Frères, pergnier,peguy-ville",,
168,HT6145,Ouest,Thomassin 32 route Aimé-Bastient imp Laurent,,
170,HT6147,Ouest,Pergnier,,
172,HT6160,Ouest,Gressie,,
176,HT6220,Ouest,PetitGoâve,,
184,HT6331,Ouest,Fonds Parisien et Galette Bonbon,,
190,HT6421,Ouest,asale,,
192,HT6520,Ouest,Pointe à Raquette (La Gonâve),,


In [69]:
df.loc[158.5] = df.loc[158].copy()
df = df.sort_index().reset_index(drop=True)

In [70]:
df[157:160]

Unnamed: 0,Postal Code,Department,Neighborhood,Nei_lat,Nei_lng
157,HT6133,Ouest,Mahotière,18.5283,-72.3963
158,HT6134,Ouest,"Arcachon, Warney",,
159,HT6134,Ouest,"Arcachon, Warney",,


In [71]:
df['Neighborhood'].loc[158] = 'Arcachon'
df['Neighborhood'].loc[159] = 'Waney'

In [72]:
location158 = geolocator.geocode('Arcachon, Carrefour, Haiti')
df['Nei_lat'].loc[158] = location158.latitude
df['Nei_lng'].loc[158] = location158.longitude
df[['Neighborhood','Nei_lat', 'Nei_lng']].loc[158]

Neighborhood    Arcachon
Nei_lat          18.5399
Nei_lng         -72.4001
Name: 158, dtype: object

In [73]:
location159 = geolocator.geocode('Waney, Carrefour, Haiti')
df['Nei_lat'].loc[159] = location159.latitude
df['Nei_lng'].loc[159] = location159.longitude
df[['Neighborhood','Nei_lat', 'Nei_lng']].loc[159]

Neighborhood      Waney
Nei_lat         18.5328
Nei_lng        -72.4046
Name: 159, dtype: object

In [74]:
df[(df.Nei_lat=='')|(df.Nei_lng=='')]

Unnamed: 0,Postal Code,Department,Neighborhood,Nei_lat,Nei_lng
161,HT6136,Ouest,"Lamentin, Mariani, Merger",,
163,HT6141,Ouest,"Frères, pergnier,peguy-ville",,
169,HT6145,Ouest,Thomassin 32 route Aimé-Bastient imp Laurent,,
171,HT6147,Ouest,Pergnier,,
173,HT6160,Ouest,Gressie,,
177,HT6220,Ouest,PetitGoâve,,
185,HT6331,Ouest,Fonds Parisien et Galette Bonbon,,
191,HT6421,Ouest,asale,,
193,HT6520,Ouest,Pointe à Raquette (La Gonâve),,
201,HT8160,Sud,L’Ïle à Vaches,,


In [75]:
df.loc[161.1] = df.loc[161].copy()
df.loc[161.9] = df.loc[161].copy()

df = df.sort_index().reset_index(drop=True)

In [76]:
df[160:165]

Unnamed: 0,Postal Code,Department,Neighborhood,Nei_lat,Nei_lng
160,HT6135,Ouest,Brochette,18.5316,-72.4154
161,HT6136,Ouest,"Lamentin, Mariani, Merger",,
162,HT6136,Ouest,"Lamentin, Mariani, Merger",,
163,HT6136,Ouest,"Lamentin, Mariani, Merger",,
164,HT6140,Ouest,Pétion-Ville,18.5169,-72.2835


In [77]:
df['Neighborhood'].loc[161] = 'Lamentin'
df['Neighborhood'].loc[162] = 'Mariani'
df['Neighborhood'].loc[163] = 'Merger'

In [78]:
location161 = geolocator.geocode('Lamentin, Carrefour, Haiti')
df['Nei_lat'].loc[161] = location161.latitude
df['Nei_lng'].loc[161] = location161.longitude
df[['Neighborhood','Nei_lat', 'Nei_lng']].loc[161]

Neighborhood    Lamentin
Nei_lat          18.5523
Nei_lng         -72.4095
Name: 161, dtype: object

In [79]:
location162 = geolocator.geocode('Mariani, Carrefour, Haiti')
df['Nei_lat'].loc[162] = location162.latitude
df['Nei_lng'].loc[162] = location162.longitude
df[['Neighborhood','Nei_lat', 'Nei_lng']].loc[162]

Neighborhood    Mariani
Nei_lat         18.5404
Nei_lng         -72.431
Name: 162, dtype: object

In [80]:
location163 = geolocator.geocode('Merger, Carrefour, Haiti')
df['Nei_lat'].loc[163] = location163.latitude
df['Nei_lng'].loc[163] = location163.longitude
df[['Neighborhood','Nei_lat', 'Nei_lng']].loc[163]

Neighborhood     Merger
Nei_lat         18.5437
Nei_lng        -72.4753
Name: 163, dtype: object

In [81]:
df[160:165]

Unnamed: 0,Postal Code,Department,Neighborhood,Nei_lat,Nei_lng
160,HT6135,Ouest,Brochette,18.5316,-72.4154
161,HT6136,Ouest,Lamentin,18.5523,-72.4095
162,HT6136,Ouest,Mariani,18.5404,-72.431
163,HT6136,Ouest,Merger,18.5437,-72.4753
164,HT6140,Ouest,Pétion-Ville,18.5169,-72.2835


In [82]:
df[(df.Nei_lat=='')|(df.Nei_lng=='')]

Unnamed: 0,Postal Code,Department,Neighborhood,Nei_lat,Nei_lng
165,HT6141,Ouest,"Frères, pergnier,peguy-ville",,
171,HT6145,Ouest,Thomassin 32 route Aimé-Bastient imp Laurent,,
173,HT6147,Ouest,Pergnier,,
175,HT6160,Ouest,Gressie,,
179,HT6220,Ouest,PetitGoâve,,
187,HT6331,Ouest,Fonds Parisien et Galette Bonbon,,
193,HT6421,Ouest,asale,,
195,HT6520,Ouest,Pointe à Raquette (La Gonâve),,
203,HT8160,Sud,L’Ïle à Vaches,,
222,HT8531,Sud,Cahonane,,


In [83]:
df.loc[165.5] = df.loc[165].copy()
df = df.sort_index().reset_index(drop=True)

In [84]:
df[164:170]

Unnamed: 0,Postal Code,Department,Neighborhood,Nei_lat,Nei_lng
164,HT6140,Ouest,Pétion-Ville,18.5169,-72.2835
165,HT6141,Ouest,"Frères, pergnier,peguy-ville",,
166,HT6141,Ouest,"Frères, pergnier,peguy-ville",,
167,HT6123,Ouest,Tabarre (zone1),18.5528,-72.2593
168,HT6124,Ouest,Tabarre (zone2),18.5528,-72.2593
169,HT6142,Ouest,Pèlerin,18.9705,-72.4121


In [85]:
df['Neighborhood'].loc[165] = 'Frères'
df['Neighborhood'].loc[166] = 'Péguy-Ville'

In [86]:
location165 = geolocator.geocode('Frères, Haiti')
df['Nei_lat'].loc[165] = location165.latitude
df['Nei_lng'].loc[165] = location165.longitude
df[['Neighborhood','Nei_lat', 'Nei_lng']].loc[165]

Neighborhood     Frères
Nei_lat          18.526
Nei_lng        -72.2754
Name: 165, dtype: object

In [87]:
location166 = geolocator.geocode('Péguy-ville Haiti')
df['Nei_lat'].loc[166] = location166.latitude
df['Nei_lng'].loc[166] = location166.longitude
df[['Neighborhood','Nei_lat', 'Nei_lng']].loc[166]

Neighborhood    Péguy-Ville
Nei_lat             18.5145
Nei_lng            -72.2719
Name: 166, dtype: object

In [88]:
df[(df.Nei_lat=='')|(df.Nei_lng=='')]

Unnamed: 0,Postal Code,Department,Neighborhood,Nei_lat,Nei_lng
172,HT6145,Ouest,Thomassin 32 route Aimé-Bastient imp Laurent,,
174,HT6147,Ouest,Pergnier,,
176,HT6160,Ouest,Gressie,,
180,HT6220,Ouest,PetitGoâve,,
188,HT6331,Ouest,Fonds Parisien et Galette Bonbon,,
194,HT6421,Ouest,asale,,
196,HT6520,Ouest,Pointe à Raquette (La Gonâve),,
204,HT8160,Sud,L’Ïle à Vaches,,
223,HT8531,Sud,Cahonane,,


In [89]:
df['Neighborhood'].loc[172] = 'Thomassin'

In [90]:
location172 = geolocator.geocode('Thomassin Haiti')
df['Nei_lat'].loc[172] = location172.latitude
df['Nei_lng'].loc[172] = location172.longitude
df[['Neighborhood','Nei_lat', 'Nei_lng']].loc[172]

Neighborhood    Thomassin
Nei_lat           18.4807
Nei_lng           -72.312
Name: 172, dtype: object

In [91]:
df['Neighborhood'].loc[174] = 'Pernier'

In [92]:
location174 = geolocator.geocode('Pernier Haiti')
df['Nei_lat'].loc[174] = location174.latitude
df['Nei_lng'].loc[174] = location174.longitude
df[['Neighborhood','Nei_lat', 'Nei_lng']].loc[174]

Neighborhood    Pernier
Nei_lat         18.5174
Nei_lng         -72.237
Name: 174, dtype: object

In [93]:
df['Neighborhood'].loc[176] = 'Gressier'

In [94]:
location176 = geolocator.geocode('Gressier Haiti')
df['Nei_lat'].loc[176] = location176.latitude
df['Nei_lng'].loc[176] = location176.longitude
df[['Neighborhood','Nei_lat', 'Nei_lng']].loc[176]

Neighborhood    Gressier
Nei_lat          18.5427
Nei_lng          -72.523
Name: 176, dtype: object

In [95]:
df['Neighborhood'].loc[180] = 'Petit Goâve'

In [96]:
location180 = geolocator.geocode('Petit Goave Haiti')
df['Nei_lat'].loc[180] = location180.latitude
df['Nei_lng'].loc[180] = location180.longitude
df[['Neighborhood','Nei_lat', 'Nei_lng']].loc[180]

Neighborhood    Petit Goâve
Nei_lat             18.4327
Nei_lng            -72.8654
Name: 180, dtype: object

In [97]:
df.loc[188.5] = df.loc[188].copy()
df = df.sort_index().reset_index(drop=True)

In [98]:
df[187:191]

Unnamed: 0,Postal Code,Department,Neighborhood,Nei_lat,Nei_lng
187,HT6330,Ouest,Ganthier,18.532,-72.0679
188,HT6331,Ouest,Fonds Parisien et Galette Bonbon,,
189,HT6331,Ouest,Fonds Parisien et Galette Bonbon,,
190,HT6330,Ouest,Cornillon,18.6752,-71.9521


In [99]:
df['Neighborhood'].loc[188] = 'Fonds Parisien'
df['Neighborhood'].loc[189] = 'Galette Bonbon'

In [100]:
location188 = geolocator.geocode('Fonds Parisien Haiti')
df['Nei_lat'].loc[188] = location188.latitude
df['Nei_lng'].loc[188] = location188.longitude
df[['Neighborhood','Nei_lat', 'Nei_lng']].loc[188]

Neighborhood    Fonds Parisien
Nei_lat                18.5072
Nei_lng               -71.9773
Name: 188, dtype: object

In [101]:
location189 = geolocator.geocode('Galette Haiti')
df['Nei_lat'].loc[189] = location189.latitude
df['Nei_lng'].loc[189] = location189.longitude
df[['Neighborhood','Nei_lat', 'Nei_lng']].loc[189]

Neighborhood    Galette Bonbon
Nei_lat                18.5401
Nei_lng               -72.2317
Name: 189, dtype: object

In [102]:
df[(df.Nei_lat=='')|(df.Nei_lng=='')]

Unnamed: 0,Postal Code,Department,Neighborhood,Nei_lat,Nei_lng
195,HT6421,Ouest,asale,,
197,HT6520,Ouest,Pointe à Raquette (La Gonâve),,
205,HT8160,Sud,L’Ïle à Vaches,,
224,HT8531,Sud,Cahonane,,


In [103]:
df['Neighborhood'].loc[195] = 'Casale'

In [104]:
location195 = geolocator.geocode('Casale Haiti')
df['Nei_lat'].loc[195] = location195.latitude
df['Nei_lng'].loc[195] = location195.longitude
df[['Neighborhood','Nei_lat', 'Nei_lng']].loc[195]

Neighborhood     Casale
Nei_lat         18.7988
Nei_lng         -72.385
Name: 195, dtype: object

In [105]:
df['Nei_lat'].loc[197] = 18.798517
df['Nei_lng'].loc[197] = -73.06226
df[['Neighborhood','Nei_lat', 'Nei_lng']].loc[197]

Neighborhood    Pointe à Raquette (La Gonâve)
Nei_lat                               18.7985
Nei_lng                              -73.0623
Name: 197, dtype: object

In [106]:
location205 = geolocator.geocode('Ile à Vache Haiti')
df['Nei_lat'].loc[205] = location205.latitude
df['Nei_lng'].loc[205] = location205.longitude
df[['Neighborhood','Nei_lat', 'Nei_lng']].loc[205]

Neighborhood    L’Ïle à Vaches
Nei_lat                 18.083
Nei_lng               -73.6078
Name: 205, dtype: object

In [107]:
df['Neighborhood'].loc[224] = 'La Cahouane'

In [108]:
df['Nei_lat'].loc[224] = 18.3133
df['Nei_lng'].loc[224] = -74.257
df[['Neighborhood','Nei_lat', 'Nei_lng']].loc[224]

Neighborhood    La Cahouane
Nei_lat             18.3133
Nei_lng             -74.257
Name: 224, dtype: object

In [109]:
df

Unnamed: 0,Postal Code,Department,Neighborhood,Nei_lat,Nei_lng
0,HT4110,Artibonite,Gonaïves,19.4461,-72.6884
1,HT4111,Artibonite,Petite Rivière de l’Artibonite,19.123,-72.48
2,HT4120,Artibonite,Ennery,19.4846,-72.4849
3,HT4130,Artibonite,L'Estère,19.3032,-72.6102
4,HT4210,Artibonite,Gros Morne,19.6716,-72.6784
5,HT4220,Artibonite,Terre-Neuve,19.6007,-72.7816
6,HT4230,Artibonite,Anse-Rouge,19.632,-73.0551
7,HT4231,Artibonite,Sources Chaudes,19.5889,-72.9021
8,HT4310,Artibonite,Saint-Marc,19.102,-72.6995
9,HT4311,Artibonite,Montrouis,18.9595,-72.6972


In [110]:
df[df['Neighborhood']=='Diquini, Thor']

Unnamed: 0,Postal Code,Department,Neighborhood,Nei_lat,Nei_lng
155,HT6132,Ouest,"Diquini, Thor",18.5328,-72.3875


In [111]:
df.loc[155.5] = df.loc[155].copy()
df = df.sort_index().reset_index(drop=True)

In [112]:
df[154:158]

Unnamed: 0,Postal Code,Department,Neighborhood,Nei_lat,Nei_lng
154,HT6131,Ouest,Bizoton,18.5304,-72.3808
155,HT6132,Ouest,"Diquini, Thor",18.5328,-72.3875
156,HT6132,Ouest,"Diquini, Thor",18.5328,-72.3875
157,HT6133,Ouest,Côte-Plage,18.5376,-72.3933


In [113]:
df['Neighborhood'].loc[155] = 'Diquini'
df['Neighborhood'].loc[156] = 'Thor'

In [114]:
location155 = geolocator.geocode('Diquini Carrefour Haiti')
df['Nei_lat'].loc[155] = location155.latitude
df['Nei_lng'].loc[155] = location155.longitude
df[['Neighborhood','Nei_lat', 'Nei_lng']].loc[155]

Neighborhood    Diquini
Nei_lat          18.532
Nei_lng        -72.3913
Name: 155, dtype: object

In [115]:
location156 = geolocator.geocode('Thor Carrefour Haiti')
df['Nei_lat'].loc[156] = location156.latitude
df['Nei_lng'].loc[156] = location156.longitude
df[['Neighborhood','Nei_lat', 'Nei_lng']].loc[156]

Neighborhood       Thor
Nei_lat          18.535
Nei_lng        -72.3896
Name: 156, dtype: object

In [116]:
df.to_csv('haiti_postal_codes.csv')

## III. Exploratory Data Analysis

Check if data types are correct:

In [117]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 5 columns):
Postal Code     240 non-null object
Department      240 non-null object
Neighborhood    240 non-null object
Nei_lat         240 non-null object
Nei_lng         240 non-null object
dtypes: object(5)
memory usage: 9.5+ KB


All data types are correct. Now check whether any column has any missing values:

In [118]:
df.isna().any()

Postal Code     False
Department      False
Neighborhood    False
Nei_lat         False
Nei_lng         False
dtype: bool

There are no missing values. Now check whether any column has any null values:

In [119]:
df.isnull().any()

Postal Code     False
Department      False
Neighborhood    False
Nei_lat         False
Nei_lng         False
dtype: bool

There are no null values. Now get the descriptives

In [120]:
df.describe()

Unnamed: 0,Postal Code,Department,Neighborhood,Nei_lat,Nei_lng
count,240,240,240,240.0,240.0
unique,227,10,239,235.0,235.0
top,HT6136,Ouest,Sources Chaudes,19.122983,-72.759335
freq,3,64,2,2.0,2.0


There should be 240 neighborhoods. But one is repeated, it is no mistake. There are two neighborhoods that have the same names but with different geographical data, there more than one street and more than one neighborhoods like this in Haiti:

In [121]:
list_neigh = list(df.Neighborhood)
list_neigh.sort()

for item in list_neigh:
    try:
        if list_neigh[list_neigh.index(item)] == list_neigh[(list_neigh.index(item))+1] :
            print(list_neigh[list_neigh.index(item)])
    except:
        print('---End of program---')

Sources Chaudes
Sources Chaudes
---End of program---


In [122]:
df[df.Neighborhood=='Sources Chaudes']

Unnamed: 0,Postal Code,Department,Neighborhood,Nei_lat,Nei_lng
7,HT4231,Artibonite,Sources Chaudes,19.5889,-72.9021
46,HT7141,Grand'Anse,Sources Chaudes,18.4829,-74.2874


In [123]:
df[df['Postal Code']=='HT6136']

Unnamed: 0,Postal Code,Department,Neighborhood,Nei_lat,Nei_lng
162,HT6136,Ouest,Lamentin,18.5523,-72.4095
163,HT6136,Ouest,Mariani,18.5404,-72.431
164,HT6136,Ouest,Merger,18.5437,-72.4753


Get the `Venues` from __*Foursquare*__:

In [124]:
# Defining credentials
CLIENT_ID = 'H4MRCQVMABMULWHEQMDDBVPA1JWH4TJGTQAEIABIOYGAR4WZ' # your Foursquare ID
CLIENT_SECRET = 'SYUJJDFYZZC14MCIJPMILOSESH2E3VBI21JSXOY030SUFQTM' # your Foursquare Secret
VERSION = '20180323' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: H4MRCQVMABMULWHEQMDDBVPA1JWH4TJGTQAEIABIOYGAR4WZ
CLIENT_SECRET:SYUJJDFYZZC14MCIJPMILOSESH2E3VBI21JSXOY030SUFQTM


In [125]:
# Function that extracts the category of each venue in every set of maximum 500 foor each neighborhood

def getNearbyVenues(depts, names, latitudes, longitudes, radius=500, limit=500):
    
    # Instantiate an empty list of venues
    venues_list = []
    for dept, name, lat, lng in zip(depts, names, latitudes, longitudes):
        print(name)
        
        #Create Foursquare API request url
        url = ('https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}'
               '&limit={}'.format(CLIENT_ID,
                                  CLIENT_SECRET,
                                  VERSION,
                                  lat,
                                  lng,
                                  radius,
                                  limit))
        
        #Send a get request to Foursquare
        results = requests.get(url).json()
        results = results['response']['groups'][0]['items']
        
        #But we don't need all the items, so for each venue, we will slice for the items needed and append
        #the corresponding items to the 'venues_list'
        #Each item will be appended to the venues_list as a tuple (which represents one venue_list of the venues_list)
        venues_list.append([(
                    dept, 
                    name, #name of the Neighborhood as defined in the beginning of the 'for' loop
                    lat, #latitude of the Neighborhood as defined in the beginning of the 'for' loop
                    lng, #longitude of the Neighborhood as defined in the beginning of the 'for' loop
                    v['venue']['name'], #v is an item from the results as defined above and we get the venue name
                    v['venue']['location']['lat'], #v is the same item from which we get the latitude as and where defined
                    v['venue']['location']['lng'], #v is the same item from which we get the longitude as and where defined
                    v['venue']['categories'][0]['name']) for v in results]) #Repeats the operation for all 100 venues for
                                                                            #the same neighborhood, then goes back to the
                                                                            #beginning of the loop to do the same for
                                                                            #another neighborhood
    
    
    #Get out of the loop to display the nearby venues gathered
    #Build a dataframe out of each item (which is now represented by a tuple) in a venue_list (which will be a row)
    #in the venues_list (which is the list of venue_list's) that will finally be the dataframe
    
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    #We are basically telling Python up here that an item is a cell, and that the cell is in a tuple (which is a row)
    #and that the tuple is in a list of tuples (which are the columns of the dataframe as they are stacked one over another)
    
    #The indices will generate automatically as integers from 0 to n, so will be the column names
    #So let's change the column names:
    nearby_venues.columns = ['Department','Neighborhood',
                            'Neighborhood_latitude',
                            'Neighborhood_longitude',
                            'Venue',
                            'Venue_latitude',
                            'Venue_longitude',
                            'Venue_category']
    
    #Return a dataframe that contains all the nearby venues retrieved for each neighborhood
    return (nearby_venues)

Apply the function above on each neighborhood from the "Haitian Neighborhoods dataframe" (df) and store the venues received in a toronto_venues dataframe:

In [127]:
# We don't need to pass the arguments limit and radius since they're given a default value that suits us

haitian_venues = getNearbyVenues(depts = df['Department'], 
                                 names=df['Neighborhood'], 
                                 latitudes=df['Nei_lat'], 
                                 longitudes=df['Nei_lng'])

print(haitian_venues.shape[0])
haitian_venues.head()

Gonaïves
Petite Rivière de l’Artibonite
Ennery
L'Estère
Gros Morne
Terre-Neuve
Anse-Rouge
Sources Chaudes
Saint-Marc
Montrouis
Verrettes
Désarmes
Deschapelles
Liancourt
Dessalines (Marchand-Dessalines)
Petite-Rivière-de-l’Artibonite
Savane à Roches
Grande-Saline
Desdunes
Marmelade
Saint-Michel-de-l’Attalaye
Hinche
Los Palis
Maïssade
Louverture
Thomonde
Cerca-Carvajal
Mirebalais
Dufailly
Saut-d’Eau
Boucan-Carré
Péligre
Lascahobas
Belladère
Baptiste
Savanette
Cerca-la-Source
Saltadère
Lamielle
Thomassique
Jérémie
 Léon
 Marfranc
Abricots
Bonbon
Moron
Sources Chaudes
Chambellan
Anse-d’Hainault
Dame-Marie
Cesson
Les Irois
Carcasse
Corail
Roseaux
Beaumont
Pestel
Miragoâne
Saint-Michel-du-Sud
Paillant
Fonds-des-Nègres
Petite-Rivière-de-Nippes
Anse-à-Veau
Baradères
Grand-Boucan
Petit-Trou-de-Nippes
L’Asile
Cap-Haïtien (la ville)
Bande-du-Nord (quartier du Cap)
Labadie (quartier du Cap)
Haut-du-Cap (quartier du Cap)
Petite-Anse
Quartier-Morin
Limonade
Acul-du-Nord
La Soufrière
Plaine-du-Nord
R

Unnamed: 0,Department,Neighborhood,Neighborhood_latitude,Neighborhood_longitude,Venue,Venue_latitude,Venue_longitude,Venue_category
0,Artibonite,Gonaïves,19.44606,-72.688434,Celeste Bar Resto,19.447856,-72.690723,Restaurant
1,Artibonite,Gonaïves,19.44606,-72.688434,Place Bouteille des Gonaives,19.446451,-72.68844,Theme Park
2,Artibonite,Gonaïves,19.44606,-72.688434,Fantastic Club,19.44739,-72.68936,Nightclub
3,Artibonite,Gonaïves,19.44606,-72.688434,K-jou Point Bar,19.447497,-72.691381,Bar
4,Artibonite,Gonaïves,19.44606,-72.688434,Place d'Armes Des Gonaives,19.44576,-72.69077,Plaza


In [128]:
len(haitian_venues)

189

Let's group the haitian_venues table by Neighborhood and count how many venues we have per neighborhood:

In [129]:
haitian_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Department,Neighborhood_latitude,Neighborhood_longitude,Venue,Venue_latitude,Venue_longitude,Venue_category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Anse à Galets (La Gonâve),4,4,4,4,4,4,4
Aquin,4,4,4,4,4,4,4
Arcachon,2,2,2,2,2,2,2
Arcahaie,1,1,1,1,1,1,1
Bande-du-Nord (quartier du Cap),1,1,1,1,1,1,1
Bas Peu de Choses,5,5,5,5,5,5,5
Bizoton,2,2,2,2,2,2,2
Bois Verna,6,6,6,6,6,6,6
Bon Repos,2,2,2,2,2,2,2
Bourdon,2,2,2,2,2,2,2


In [130]:
# create map
location_Haiti = geolocator.geocode('Haiti')

map_venues = folium.Map(location=[location_Haiti.latitude, location_Haiti.longitude], 
                          zoom_start=8)


# Define markers colors intuitively representing the meaning of each cluster
markers_colors = ['red', 'orange', 'blue', 'green', 'yellow', 'black', 'grey', 'orangered', 'salmon', 'cyan']
dept_names = list(set(df.Department))

# Loop through the neighborhoods and add a marker for each
for lat, lon, venue_name, neigh, dept in zip(haitian_venues['Venue_latitude'], 
                                  haitian_venues['Venue_longitude'], 
                                  haitian_venues['Venue'], 
                                  haitian_venues['Neighborhood'], 
                                  haitian_venues['Department']):
    label = folium.Popup(venue_name + ' (around ' + neigh + ','+ dept + ')', parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=markers_colors[dept_names.index(dept)],
        fill=True,
        fill_color=markers_colors[dept_names.index(dept)],
        fill_opacity=0.7).add_to(map_venues)
       
map_venues

In [131]:
dep_color_codes = pd.DataFrame({'Department': dept_names, 'Color Code': markers_colors})

In [132]:
dep_color_codes

Unnamed: 0,Department,Color Code
0,Grand'Anse,red
1,Nord-Ouest,orange
2,Nord,blue
3,Nippes,green
4,Nord-Est,yellow
5,Artibonite,black
6,Sud-Est,grey
7,Centre,orangered
8,Ouest,salmon
9,Sud,cyan


In [133]:
map_venues.save('Venues by Departement.html')

## IV. Neighborhoods Clustering

Now we will one-hot encode the categories in order to cluster the Neighborhoods based on their mean values

In [134]:
print('There are {} unique categories of venues in the Haitian Venues sample.'.
      format(len(haitian_venues['Venue_category'].unique())))

There are 78 unique categories of venues in the Haitian Venues sample.


In [135]:
haitian_venues.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 189 entries, 0 to 188
Data columns (total 8 columns):
Department                189 non-null object
Neighborhood              189 non-null object
Neighborhood_latitude     189 non-null float64
Neighborhood_longitude    189 non-null float64
Venue                     189 non-null object
Venue_latitude            189 non-null float64
Venue_longitude           189 non-null float64
Venue_category            189 non-null object
dtypes: float64(4), object(4)
memory usage: 11.9+ KB


In [136]:
haitian_venues.describe()

Unnamed: 0,Neighborhood_latitude,Neighborhood_longitude,Venue_latitude,Venue_longitude
count,189.0,189.0,189.0,189.0
mean,18.79832,-72.490967,18.798138,-72.490855
std,0.491478,0.448096,0.491539,0.448204
min,18.1642,-74.216984,18.168433,-74.215872
25%,18.516915,-72.633426,18.513499,-72.633708
50%,18.538834,-72.332276,18.539231,-72.335486
75%,19.102027,-72.283468,19.099676,-72.28249
max,19.938986,-71.725833,19.940125,-71.726019


In [137]:
# One-hot encode the venue categories
ht_one_hot = pd.get_dummies(haitian_venues['Venue_category'], prefix='', prefix_sep='')

# Add the 'Neighborhood' to this new dataset
ht_one_hot['Neighborhood'] = haitian_venues['Neighborhood']

ht_one_hot.head()

Unnamed: 0,Airport,Airport Service,Airport Terminal,American Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,BBQ Joint,Bakery,Bar,Basketball Court,Beach,Beer Garden,Bistro,Boat or Ferry,Border Crossing,Breakfast Spot,Burger Joint,Bus Station,Café,Cajun / Creole Restaurant,Campground,Caribbean Restaurant,Cocktail Bar,Coffee Shop,Convenience Store,Cosmetics Shop,Diner,Duty-free Shop,Eastern European Restaurant,Electronics Store,Farm,Farmers Market,Fast Food Restaurant,Flea Market,Flower Shop,Food & Drink Shop,Football Stadium,French Restaurant,Furniture / Home Store,Garden Center,Grocery Store,Gym / Fitness Center,Historic Site,Hobby Shop,Hotel,Hotel Bar,Hotel Pool,IT Services,Ice Cream Shop,Italian Restaurant,Lake,Liquor Store,Market,Mobile Phone Shop,Mountain,Moving Target,Music Venue,Nightclub,Park,Performing Arts Venue,Pharmacy,Pier,Pizza Place,Plaza,Rest Area,Restaurant,Sandwich Place,Scenic Lookout,Shopping Mall,Snack Place,Soccer Field,Souvenir Shop,Sports Bar,Steakhouse,Supermarket,Tennis Court,Theme Park,Neighborhood
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,Gonaïves
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,Gonaïves
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Gonaïves
3,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Gonaïves
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,Gonaïves


In [138]:
# Move the 'Neighborhood' column to the first position
arrange_col = [ht_one_hot.columns[-1]] + list(ht_one_hot.columns)[:-1]
ht_onehot = ht_one_hot[arrange_col]

ht_onehot.head()

Unnamed: 0,Neighborhood,Airport,Airport Service,Airport Terminal,American Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,BBQ Joint,Bakery,Bar,Basketball Court,Beach,Beer Garden,Bistro,Boat or Ferry,Border Crossing,Breakfast Spot,Burger Joint,Bus Station,Café,Cajun / Creole Restaurant,Campground,Caribbean Restaurant,Cocktail Bar,Coffee Shop,Convenience Store,Cosmetics Shop,Diner,Duty-free Shop,Eastern European Restaurant,Electronics Store,Farm,Farmers Market,Fast Food Restaurant,Flea Market,Flower Shop,Food & Drink Shop,Football Stadium,French Restaurant,Furniture / Home Store,Garden Center,Grocery Store,Gym / Fitness Center,Historic Site,Hobby Shop,Hotel,Hotel Bar,Hotel Pool,IT Services,Ice Cream Shop,Italian Restaurant,Lake,Liquor Store,Market,Mobile Phone Shop,Mountain,Moving Target,Music Venue,Nightclub,Park,Performing Arts Venue,Pharmacy,Pier,Pizza Place,Plaza,Rest Area,Restaurant,Sandwich Place,Scenic Lookout,Shopping Mall,Snack Place,Soccer Field,Souvenir Shop,Sports Bar,Steakhouse,Supermarket,Tennis Court,Theme Park
0,Gonaïves,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
1,Gonaïves,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,Gonaïves,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Gonaïves,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Gonaïves,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


We have 79 columns now that we have added the `Neighborhood` which is not a category

In [139]:
# Confirm the new columns shape is still equal to the number of categories

ht_onehot.shape

(189, 79)

We will cluster based on the most frequent categories of venues in each Neighborhood. We compute the frequencies by taking the means of the dummies grouped by respective Neighborhood:

In [140]:
# Compute mean of occurences of categories in each 'Neighborhood'

ht_grouped = ht_onehot.groupby('Neighborhood').mean().reset_index()
ht_grouped

Unnamed: 0,Neighborhood,Airport,Airport Service,Airport Terminal,American Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,BBQ Joint,Bakery,Bar,Basketball Court,Beach,Beer Garden,Bistro,Boat or Ferry,Border Crossing,Breakfast Spot,Burger Joint,Bus Station,Café,Cajun / Creole Restaurant,Campground,Caribbean Restaurant,Cocktail Bar,Coffee Shop,Convenience Store,Cosmetics Shop,Diner,Duty-free Shop,Eastern European Restaurant,Electronics Store,Farm,Farmers Market,Fast Food Restaurant,Flea Market,Flower Shop,Food & Drink Shop,Football Stadium,French Restaurant,Furniture / Home Store,Garden Center,Grocery Store,Gym / Fitness Center,Historic Site,Hobby Shop,Hotel,Hotel Bar,Hotel Pool,IT Services,Ice Cream Shop,Italian Restaurant,Lake,Liquor Store,Market,Mobile Phone Shop,Mountain,Moving Target,Music Venue,Nightclub,Park,Performing Arts Venue,Pharmacy,Pier,Pizza Place,Plaza,Rest Area,Restaurant,Sandwich Place,Scenic Lookout,Shopping Mall,Snack Place,Soccer Field,Souvenir Shop,Sports Bar,Steakhouse,Supermarket,Tennis Court,Theme Park
0,Anse à Galets (La Gonâve),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Aquin,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Arcachon,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Arcahaie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Bande-du-Nord (quartier du Cap),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Bas Peu de Choses,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Bizoton,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Bois Verna,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Bon Repos,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Bourdon,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


The above results are a bit confusing and some of them are not really significant. We may have a better idea of the results if we print the top 10 most frequent venue categories in each neighborhood instead:

In [141]:
num_top_venues = 10

for hood in ht_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = ht_grouped[(ht_grouped['Neighborhood'] == hood)].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Anse à Galets (La Gonâve)----
               venue  freq
0      Grocery Store  0.25
1              Beach  0.25
2       Burger Joint  0.25
3              Hotel  0.25
4        Music Venue  0.00
5      Moving Target  0.00
6           Mountain  0.00
7  Mobile Phone Shop  0.00
8             Market  0.00
9       Liquor Store  0.00


----Aquin----
                  venue  freq
0  Caribbean Restaurant  0.25
1   Arts & Crafts Store  0.25
2                Bakery  0.25
3           IT Services  0.25
4                  Lake  0.00
5             Nightclub  0.00
6           Music Venue  0.00
7         Moving Target  0.00
8              Mountain  0.00
9     Mobile Phone Shop  0.00


----Arcachon----
                venue  freq
0                Café   0.5
1       Moving Target   0.5
2  Italian Restaurant   0.0
3         Music Venue   0.0
4            Mountain   0.0
5   Mobile Phone Shop   0.0
6              Market   0.0
7        Liquor Store   0.0
8                Lake   0.0
9      Ice Cream Shop   

9         IT Services   0.0


----Lalue----
                  venue  freq
0     Electronics Store  0.17
1                  Park  0.17
2                 Hotel  0.17
3                Market  0.17
4  Fast Food Restaurant  0.17
5           Pizza Place  0.17
6    Italian Restaurant  0.00
7              Mountain  0.00
8     Mobile Phone Shop  0.00
9          Liquor Store  0.00


----Lascahobas----
                venue  freq
0              Market   0.5
1                 Bar   0.5
2             Airport   0.0
3  Italian Restaurant   0.0
4         Music Venue   0.0
5       Moving Target   0.0
6            Mountain   0.0
7   Mobile Phone Shop   0.0
8        Liquor Store   0.0
9                Lake   0.0


----Les Anglais----
                venue  freq
0                Farm   1.0
1             Airport   0.0
2      Ice Cream Shop   0.0
3       Moving Target   0.0
4            Mountain   0.0
5   Mobile Phone Shop   0.0
6              Market   0.0
7        Liquor Store   0.0
8                Lake  

                venue  freq
0      Cosmetics Shop   1.0
1             Airport   0.0
2  Italian Restaurant   0.0
3       Moving Target   0.0
4            Mountain   0.0
5   Mobile Phone Shop   0.0
6              Market   0.0
7        Liquor Store   0.0
8                Lake   0.0
9      Ice Cream Shop   0.0


----Thomonde----
                       venue  freq
0  Cajun / Creole Restaurant   1.0
1                    Airport   0.0
2         Italian Restaurant   0.0
3                Music Venue   0.0
4              Moving Target   0.0
5                   Mountain   0.0
6          Mobile Phone Shop   0.0
7                     Market   0.0
8               Liquor Store   0.0
9                       Lake   0.0


----Thor----
                venue  freq
0           Nightclub  0.25
1               Hotel  0.25
2              Market  0.25
3              Bakery  0.25
4  Italian Restaurant  0.00
5         Music Venue  0.00
6       Moving Target  0.00
7            Mountain  0.00
8   Mobile Phone Shop

Let's write a function to return the most common venues

In [142]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Create dataframe that will receive the 5 most common venues, because according to the data above, most neighborhoods have around 5 most frequent venues, the others having around 0% of occurence

In [143]:
num_top_venues = 10

# Define numerical suffixes for columns names
indicators = ['st', 'nd', 'rd']

# Create a single-element column list containing only the 'Neighborhood' variable
# Then append 10 other column names to the list following the format: 
# 'ind-st, ind-nd, ind-rd' or 'ind-th' for ind >= 3

columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create an empty dataframe whose columns are the columns list created above through the 'for' loop
# And fill its first column with the 'Neighborhood' column from the toronto_grouped table
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = ht_grouped['Neighborhood']

for ind in np.arange(ht_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(ht_grouped.iloc[ind, :], num_top_venues)
    
neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Anse à Galets (La Gonâve),Beach,Burger Joint,Grocery Store,Hotel,Theme Park,Caribbean Restaurant,Cocktail Bar,Coffee Shop,Convenience Store,Cosmetics Shop
1,Aquin,Caribbean Restaurant,Arts & Crafts Store,Bakery,IT Services,Theme Park,Duty-free Shop,Cocktail Bar,Coffee Shop,Convenience Store,Cosmetics Shop
2,Arcachon,Moving Target,Café,Diner,Cajun / Creole Restaurant,Campground,Caribbean Restaurant,Cocktail Bar,Coffee Shop,Convenience Store,Cosmetics Shop
3,Arcahaie,Market,Theme Park,Diner,Campground,Caribbean Restaurant,Cocktail Bar,Coffee Shop,Convenience Store,Cosmetics Shop,Duty-free Shop
4,Bande-du-Nord (quartier du Cap),Pier,Theme Park,Bus Station,Cajun / Creole Restaurant,Campground,Caribbean Restaurant,Cocktail Bar,Coffee Shop,Convenience Store,Cosmetics Shop


Let us make 5 clusters

In [144]:
# Drop 'Neighborhood' column since it's not numerical

ht_grouped_clustering = ht_grouped.drop('Neighborhood', axis=1)
kclusters = 5

In [145]:
# Run KMeans Clustering algorithm and have a peek at the labels

# We run the algorithm 15 times and choose the results that for which the distance between the clusters is maximized
# and the distance between the Neighborhoods is minimized, random_state is fixed at 0 for reproducibility
clusters = KMeans(init='k-means++', n_clusters=kclusters, n_init=20, random_state=0)
clusters.fit(ht_grouped_clustering)
clusters.labels_[:10]

array([1, 1, 1, 1, 1, 3, 3, 1, 1, 4])

Complete the ht_grouped data to make a more complete dataset, with initial data on Haitian Neighborhoods from df:

In [147]:
# Insert a new column named 'Cluster Labels' at position 0 of 'neighborhoods_venues_sorted' dataset and fill it with
# the labels from the KMeans clustering
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', clusters.labels_)

# Copy the 'dft' dataset
ht_merged = df.copy()

# Merge it with the 'neighborhood_venues_sorted' table with the join method (We could've used the pd.merge method, too)
ht_merged = ht_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

ht_merged.head() # check the last columns!

Unnamed: 0,Postal Code,Department,Neighborhood,Nei_lat,Nei_lng,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,HT4110,Artibonite,Gonaïves,19.4461,-72.6884,1.0,Theme Park,Shopping Mall,Nightclub,Cosmetics Shop,Plaza,Bar,Restaurant,Gym / Fitness Center,Café,Eastern European Restaurant
1,HT4111,Artibonite,Petite Rivière de l’Artibonite,19.123,-72.48,1.0,Farm,Theme Park,Duty-free Shop,Campground,Caribbean Restaurant,Cocktail Bar,Coffee Shop,Convenience Store,Cosmetics Shop,Diner
2,HT4120,Artibonite,Ennery,19.4846,-72.4849,1.0,Campground,Plaza,Theme Park,Bus Station,Cajun / Creole Restaurant,Caribbean Restaurant,Cocktail Bar,Coffee Shop,Convenience Store,Cosmetics Shop
3,HT4130,Artibonite,L'Estère,19.3032,-72.6102,1.0,Garden Center,Theme Park,Duty-free Shop,Campground,Caribbean Restaurant,Cocktail Bar,Coffee Shop,Convenience Store,Cosmetics Shop,Diner
4,HT4210,Artibonite,Gros Morne,19.6716,-72.6784,1.0,Diner,Theme Park,Duty-free Shop,Campground,Caribbean Restaurant,Cocktail Bar,Coffee Shop,Convenience Store,Cosmetics Shop,Eastern European Restaurant


All neighborhoods that could not be put in a cluster have nan values, we have to get rid of them

In [148]:
ht_merged = ht_merged.dropna(subset=['Cluster Labels'])
ht_merged.shape

(80, 16)

By doing the above, we've gone from 189 venues to 80 venues only. We don't have the choice, we must use that little data to carry the analysis:

In [149]:
ht_merged['Cluster Labels'] = ht_merged['Cluster Labels'].astype(int)

In [150]:
ht_merged.head()

Unnamed: 0,Postal Code,Department,Neighborhood,Nei_lat,Nei_lng,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,HT4110,Artibonite,Gonaïves,19.4461,-72.6884,1,Theme Park,Shopping Mall,Nightclub,Cosmetics Shop,Plaza,Bar,Restaurant,Gym / Fitness Center,Café,Eastern European Restaurant
1,HT4111,Artibonite,Petite Rivière de l’Artibonite,19.123,-72.48,1,Farm,Theme Park,Duty-free Shop,Campground,Caribbean Restaurant,Cocktail Bar,Coffee Shop,Convenience Store,Cosmetics Shop,Diner
2,HT4120,Artibonite,Ennery,19.4846,-72.4849,1,Campground,Plaza,Theme Park,Bus Station,Cajun / Creole Restaurant,Caribbean Restaurant,Cocktail Bar,Coffee Shop,Convenience Store,Cosmetics Shop
3,HT4130,Artibonite,L'Estère,19.3032,-72.6102,1,Garden Center,Theme Park,Duty-free Shop,Campground,Caribbean Restaurant,Cocktail Bar,Coffee Shop,Convenience Store,Cosmetics Shop,Diner
4,HT4210,Artibonite,Gros Morne,19.6716,-72.6784,1,Diner,Theme Park,Duty-free Shop,Campground,Caribbean Restaurant,Cocktail Bar,Coffee Shop,Convenience Store,Cosmetics Shop,Eastern European Restaurant


We may try to name the clusters based on the most frequent venues:

_**Cluster 1**_ - Night Clubs

In [151]:
ht_merged.loc[ht_merged['Cluster Labels'] == 0,
                   ht_merged.columns[[2] + list(range(5, ht_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
204,Camp Perrin,0,Nightclub,Diner,Cajun / Creole Restaurant,Campground,Caribbean Restaurant,Cocktail Bar,Coffee Shop,Convenience Store,Cosmetics Shop,Duty-free Shop


In [152]:
ht_grouped[ht_grouped.Neighborhood=='Camp Perrin']

Unnamed: 0,Neighborhood,Airport,Airport Service,Airport Terminal,American Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,BBQ Joint,Bakery,Bar,Basketball Court,Beach,Beer Garden,Bistro,Boat or Ferry,Border Crossing,Breakfast Spot,Burger Joint,Bus Station,Café,Cajun / Creole Restaurant,Campground,Caribbean Restaurant,Cocktail Bar,Coffee Shop,Convenience Store,Cosmetics Shop,Diner,Duty-free Shop,Eastern European Restaurant,Electronics Store,Farm,Farmers Market,Fast Food Restaurant,Flea Market,Flower Shop,Food & Drink Shop,Football Stadium,French Restaurant,Furniture / Home Store,Garden Center,Grocery Store,Gym / Fitness Center,Historic Site,Hobby Shop,Hotel,Hotel Bar,Hotel Pool,IT Services,Ice Cream Shop,Italian Restaurant,Lake,Liquor Store,Market,Mobile Phone Shop,Mountain,Moving Target,Music Venue,Nightclub,Park,Performing Arts Venue,Pharmacy,Pier,Pizza Place,Plaza,Rest Area,Restaurant,Sandwich Place,Scenic Lookout,Shopping Mall,Snack Place,Soccer Field,Souvenir Shop,Sports Bar,Steakhouse,Supermarket,Tennis Court,Theme Park
10,Camp Perrin,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


_**Cluster 2**_ - Bars & Boutiques

In [153]:
ht_merged.loc[ht_merged['Cluster Labels'] == 1,
                   ht_merged.columns[[2] + list(range(5, ht_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Gonaïves,1,Theme Park,Shopping Mall,Nightclub,Cosmetics Shop,Plaza,Bar,Restaurant,Gym / Fitness Center,Café,Eastern European Restaurant
1,Petite Rivière de l’Artibonite,1,Farm,Theme Park,Duty-free Shop,Campground,Caribbean Restaurant,Cocktail Bar,Coffee Shop,Convenience Store,Cosmetics Shop,Diner
2,Ennery,1,Campground,Plaza,Theme Park,Bus Station,Cajun / Creole Restaurant,Caribbean Restaurant,Cocktail Bar,Coffee Shop,Convenience Store,Cosmetics Shop
3,L'Estère,1,Garden Center,Theme Park,Duty-free Shop,Campground,Caribbean Restaurant,Cocktail Bar,Coffee Shop,Convenience Store,Cosmetics Shop,Diner
4,Gros Morne,1,Diner,Theme Park,Duty-free Shop,Campground,Caribbean Restaurant,Cocktail Bar,Coffee Shop,Convenience Store,Cosmetics Shop,Eastern European Restaurant
8,Saint-Marc,1,Shopping Mall,Basketball Court,Theme Park,Diner,Campground,Caribbean Restaurant,Cocktail Bar,Coffee Shop,Convenience Store,Cosmetics Shop
15,Petite-Rivière-de-l’Artibonite,1,Farm,Theme Park,Duty-free Shop,Campground,Caribbean Restaurant,Cocktail Bar,Coffee Shop,Convenience Store,Cosmetics Shop,Diner
19,Marmelade,1,Campground,Theme Park,Duty-free Shop,Caribbean Restaurant,Cocktail Bar,Coffee Shop,Convenience Store,Cosmetics Shop,Diner,Eastern European Restaurant
24,Louverture,1,Airport,Airport Service,Airport Terminal,American Restaurant,Coffee Shop,Convenience Store,Duty-free Shop,Eastern European Restaurant,Caribbean Restaurant,Cocktail Bar
31,Péligre,1,Lake,Theme Park,Diner,Campground,Caribbean Restaurant,Cocktail Bar,Coffee Shop,Convenience Store,Cosmetics Shop,Duty-free Shop


In [154]:
c1 = list(ht_merged['1st Most Common Venue'][ht_merged['Cluster Labels'] == 1])

In [155]:
for venue in set(c1):
    print(venue, ':', c1.count(venue))

Plaza : 1
Bus Station : 1
Airport : 1
Liquor Store : 1
Hotel Pool : 1
Garden Center : 1
Snack Place : 1
Arts & Crafts Store : 1
Mountain : 2
Nightclub : 3
Hotel : 1
Boat or Ferry : 1
Art Gallery : 1
Campground : 3
Football Stadium : 1
Theme Park : 1
Bar : 4
Pharmacy : 1
Historic Site : 1
Souvenir Shop : 1
Moving Target : 1
Hobby Shop : 1
Food & Drink Shop : 2
Cosmetics Shop : 1
Diner : 1
Market : 2
Flea Market : 1
Lake : 1
Farmers Market : 1
Shopping Mall : 1
Burger Joint : 1
Tennis Court : 1
Border Crossing : 1
Caribbean Restaurant : 1
Fast Food Restaurant : 1
Bistro : 1
Steakhouse : 1
Farm : 3
Mobile Phone Shop : 1
Pier : 1
Park : 1
Beach : 2
Flower Shop : 1


_**Cluster 3**_ - Creole Restaurants

In [156]:
ht_merged.loc[ht_merged['Cluster Labels'] == 2,
                   ht_merged.columns[[2] + list(range(5, ht_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
21,Hinche,2,Cajun / Creole Restaurant,Asian Restaurant,Duty-free Shop,Campground,Caribbean Restaurant,Cocktail Bar,Coffee Shop,Convenience Store,Cosmetics Shop,Diner
25,Thomonde,2,Cajun / Creole Restaurant,Duty-free Shop,Campground,Caribbean Restaurant,Cocktail Bar,Coffee Shop,Convenience Store,Cosmetics Shop,Diner,Theme Park
205,Maniche,2,Cajun / Creole Restaurant,Duty-free Shop,Campground,Caribbean Restaurant,Cocktail Bar,Coffee Shop,Convenience Store,Cosmetics Shop,Diner,Theme Park
212,Vieux Bourg d’Aquin,2,Cajun / Creole Restaurant,Duty-free Shop,Campground,Caribbean Restaurant,Cocktail Bar,Coffee Shop,Convenience Store,Cosmetics Shop,Diner,Theme Park


_**Cluster 4**_ - Hosting Places

In [157]:
ht_merged.loc[ht_merged['Cluster Labels'] == 3,
                   ht_merged.columns[[2] + list(range(5, ht_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
10,Verrettes,3,Hotel,Theme Park,Duty-free Shop,Campground,Caribbean Restaurant,Cocktail Bar,Coffee Shop,Convenience Store,Cosmetics Shop,Diner
27,Mirebalais,3,Hotel,Restaurant,Park,Ice Cream Shop,Bar,Diner,Campground,Caribbean Restaurant,Cocktail Bar,Coffee Shop
57,Miragoâne,3,Caribbean Restaurant,Theme Park,Duty-free Shop,Campground,Cocktail Bar,Coffee Shop,Convenience Store,Cosmetics Shop,Diner,Eastern European Restaurant
61,Petite-Rivière-de-Nippes,3,Restaurant,Theme Park,Bus Station,Cajun / Creole Restaurant,Campground,Caribbean Restaurant,Cocktail Bar,Coffee Shop,Convenience Store,Cosmetics Shop
82,Saint-Raphaël,3,Caribbean Restaurant,Scenic Lookout,Theme Park,Diner,Cajun / Creole Restaurant,Campground,Cocktail Bar,Coffee Shop,Convenience Store,Cosmetics Shop
105,Ouanaminthe,3,Caribbean Restaurant,Theme Park,Duty-free Shop,Campground,Cocktail Bar,Coffee Shop,Convenience Store,Cosmetics Shop,Diner,Eastern European Restaurant
134,Jean-Rabel,3,Caribbean Restaurant,Convenience Store,Theme Park,Duty-free Shop,Campground,Cocktail Bar,Coffee Shop,Cosmetics Shop,Diner,Eastern European Restaurant
138,Bas Peu de Choses,3,Bar,Caribbean Restaurant,Fast Food Restaurant,Hotel,Restaurant,Duty-free Shop,Campground,Cocktail Bar,Coffee Shop,Convenience Store
154,Bizoton,3,Restaurant,Pharmacy,Theme Park,Cajun / Creole Restaurant,Campground,Caribbean Restaurant,Cocktail Bar,Coffee Shop,Convenience Store,Cosmetics Shop
166,Frères,3,Sandwich Place,Restaurant,Fast Food Restaurant,Cajun / Creole Restaurant,Campground,Caribbean Restaurant,Cocktail Bar,Coffee Shop,Convenience Store,Cosmetics Shop


_**Cluster 5**_ - Parks

In [158]:
ht_merged.loc[ht_merged['Cluster Labels'] == 4,
                   ht_merged.columns[[2] + list(range(5, ht_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
108,Trou-du-Nord,4,Park,Theme Park,Bus Station,Cajun / Creole Restaurant,Campground,Caribbean Restaurant,Cocktail Bar,Coffee Shop,Convenience Store,Cosmetics Shop
137,Bourdon,4,Park,Bakery,Theme Park,Diner,Campground,Caribbean Restaurant,Cocktail Bar,Coffee Shop,Convenience Store,Cosmetics Shop
145,Martissant,4,Park,Theme Park,Bus Station,Cajun / Creole Restaurant,Campground,Caribbean Restaurant,Cocktail Bar,Coffee Shop,Convenience Store,Cosmetics Shop
184,Croix-des-Bouquets,4,Park,Theme Park,Bus Station,Cajun / Creole Restaurant,Campground,Caribbean Restaurant,Cocktail Bar,Coffee Shop,Convenience Store,Cosmetics Shop


In [159]:
haitian_venues[(haitian_venues.Neighborhood=='Trou-du-Nord')|
           (haitian_venues.Neighborhood=='Bourdon')|
           (haitian_venues.Neighborhood=='Martissant')|
           (haitian_venues.Neighborhood=='Croix-des-Bouquets')]

Unnamed: 0,Department,Neighborhood,Neighborhood_latitude,Neighborhood_longitude,Venue,Venue_latitude,Venue_longitude,Venue_category
58,Nord-Est,Trou-du-Nord,19.616781,-72.024863,Place Saint-Jean,19.618597,-72.021904,Park
75,Ouest,Bourdon,18.538669,-72.315371,Parc Bourdon,18.537654,-72.317073,Park
76,Ouest,Bourdon,18.538669,-72.315371,Folies Gourmandes,18.537329,-72.318102,Bakery
99,Ouest,Martissant,18.527369,-72.357049,Parc Memorial Martissant,18.525943,-72.359425,Park
159,Ouest,Croix-des-Bouquets,18.577544,-72.229622,Place Croix Des Bouquets,18.578215,-72.226747,Park


In [160]:
# create map
location_Haiti = geolocator.geocode('Haiti')

map_clusters = folium.Map(location=[location_Haiti.latitude, location_Haiti.longitude], 
                          zoom_start=8)


# Define markers colors intuitively representing the meaning of each cluster
icon_marker_urls = ['marker_icons/night_clubs.png',
                    'marker_icons/bars_boutiques.png',
                    'marker_icons/creole_restaurants.png',
                    'marker_icons/hosting_places.png',
                    'marker_icons/parks.png']

cluster_names = ['Night Clubs', 'Bars & Boutiques', 'Creole Restaurants', 'Hosting Places', 'Parks']

# Loop through the neighborhoods and add a marker for each
for lat, lon, nei, cluster in zip(ht_merged['Nei_lat'], 
                                    ht_merged['Nei_lng'], 
                                    ht_merged['Neighborhood'], 
                                    ht_merged['Cluster Labels']):
    
    label = folium.Popup(str(nei) + ' (' + cluster_names[cluster] + ')', parse_html=True)
    
    
    folium.Marker(
        [lat, lon],
        popup=label,
        icon = folium.features.CustomIcon(icon_marker_urls[cluster], icon_size=(50, 50))
        ).add_to(map_clusters)
       
map_clusters

In [161]:
map_clusters.save('Neighborhoods Clustered by Venues Frequency in vicinity.html')

**With this map, anybody knows which venues they're more likely to find in a radius of 500 meters around the neighborhood they are in or they are about to go to. The map may serve as a way to plan travels, as well as an indicator especially when there is no internet in the area and they can't check google maps. It is a good alternative as well, if google's data on venues is unsufficient.**

_For consultations, reach out to Data Scientist © Grégory PINCHINAT by writing an email to: gregpinchy@gmail.com._