### Libraries

In [1]:
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd
import csv

### Getting the source code webpage

In [2]:
source = requests.get('https://www.annuaire-administration.com/code-postal/departement/paris.html').text 
soup = bs(source, 'lxml')

### Creating a csv File and adding the columns name to it

In [3]:
table = soup.find(lambda tag: tag.name=='table' and tag.has_attr('width') and tag['width']=="650" 
                  and tag.has_attr('cellpadding') and tag['cellpadding']=="0" and tag.has_attr('align') and tag['align']=="center") 

rows = table.find_all(lambda tag: tag.name=='tr')

In [4]:
csv_file = open('paris_postal_codes.csv', 'w')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['Postcodes', 'Arrondissement'])
Postcodes=[]
Arrondissement=[]

### Scrapping the webpage to only get the table

In [5]:
for row in rows:
    columns = row.find_all('td')
    postcode = columns[0].text
    Postcodes.append(postcode)
    arron=columns[1].text
    Arrondissement.append(arron)
for a,b in zip(Postcodes,Arrondissement):
    csv_writer.writerow([a, b]) # Writing the rows in the csv file

### Close the csv file

In [6]:
csv_file.close()

### Creating the dataframe


In [7]:
df = pd.read_csv('paris_postal_codes.csv')

In [8]:
df=df.iloc[2:]
df.reset_index(drop=True, inplace=True)

In [9]:
df['Postcodes'] = df['Postcodes'].replace('\r\n','', regex=True)
df['Arrondissement'] = df['Arrondissement'].replace('\r\n','', regex=True)
df['Postcodes'] = df['Postcodes'].replace('Code Postal','', regex=True)

### Checking the dataframe

In [10]:
df

Unnamed: 0,Postcodes,Arrondissement
0,75001,Paris 1er Arrondissement
1,75002,Paris 2e Arrondissement
2,75003,Paris 3e Arrondissement
3,75004,Paris 4e Arrondissement
4,75005,Paris 5e Arrondissement
5,75006,Paris 6e Arrondissement
6,75007,Paris 7e Arrondissement
7,75008,Paris 8e Arrondissement
8,75009,Paris 9e Arrondissement
9,75010,Paris 10e Arrondissement


In [11]:
df2=pd.read_csv("laboratoires-danalyses-medicales.csv",sep=';')
df2

Unnamed: 0,CODE_CPAM,NUM_FINESS,LISTE,LABORATOIRE,ADRESSE,CODE_POSTAL,VILLE,TELEPHONE,HORAIRES,geo_shape,geo_point_2d
0,75370111,ND,11ème - Boulevard Richard Lenoir,Laboratoire Cerballiance site Chemin Vert,42 boulevard Richard Lenoir,75011,Paris,01 43 55 87 40,"Du lundi au vendredi de 7h30 à 18h30, samedi d...","{""type"": ""Point"", ""coordinates"": [2.3716544701...","48.8577489375,2.37165447016"
1,75370177,ND,12ème - Rue Jacques Hillairet,Laboratoire Cerballiance site Daumesnil,33 rue Jacques Hillairet,75012,Paris,01 44 87 01 01,"Du lundi au vendredi de 7h30 à 17h15, samedi d...","{""type"": ""Point"", ""coordinates"": [2.3888482980...","48.8433686592,2.38884829808"
2,75370146,ND,14ème - Avenue Denfert Rochereau,Laboratoire BioGroup LCD site Denfert,87 avenue Denfert Rochereau,75014,Paris,01 44 41 70 50,"Du lundi au vendredi de 7h30 à 18h30, samedi d...","{""type"": ""Point"", ""coordinates"": [2.3336349788...","48.8353183115,2.33363497886"
3,75370188,ND,15ème - Rue de Vouillé,Laboratoire Cerballiance site Vouillé,20 rue de Vouillé,75015,Paris,01 45 31 55 20,"Du lundi au vendredi de 7h à 18h, samedi de 7h...","{""type"": ""Point"", ""coordinates"": [2.3043358410...","48.8347838657,2.30433584102"
4,75370040,ND,16ème - Rue de Chaillot,Laboratoire Unlabs site Paris Chaillot,10 rue de Chaillot,75116,Paris,01 53 57 40 40,"Du lundi au vendredi de 8h à 19h, samedi de 8h...","{""type"": ""Point"", ""coordinates"": [2.2972156833...","48.8669660284,2.29721568337"
...,...,...,...,...,...,...,...,...,...,...,...
151,75378287,ND,9ème - Rue Drouot,Laboratoire Drouot,21 rue Drouot,75009,Paris,01 45 23 10 45,"Du lundi au vendredi de 8h à 19h, samedi de 8h...","{""type"": ""Point"", ""coordinates"": [2.3406816342...","48.8743241164,2.34068163426"
152,75370117,ND,18ème - Rue Marx Dormoy,Laboratoire BioGroup LCD site Marx Dormoy,59 rue Marx Dormoy,75018,Paris,01 42 09 33 66,"Du lundi au vendredi de 7h à 19h, samedi de 7h...","{""type"": ""Point"", ""coordinates"": [2.3597052690...","48.8891529597,2.35970526905"
153,75370196,ND,18ème - Rue de Trétaigne,Laboratoire BioGroup LCD site Duchemin,19 rue de Trétaigne,75018,Paris,01 42 23 50 00,"Du lundi au vendredi de 7h à 19h, samedi de 8h...","{""type"": ""Point"", ""coordinates"": [2.3424324358...","48.8926257546,2.34243243584"
154,75370457,ND,20ème - Rue de Bagnolet,Laboratoire Biofutur site Bagnolet,55 rue de Bagnolet,75020,Paris,01 43 70 34 75,Du lundi au vendredi de 7h45 à 18h30,"{""type"": ""Point"", ""coordinates"": [2.3982132559...","48.8571855667,2.39821325595"


### Clean our dataset

In [12]:
df2[['lat','lng']]=df2['geo_point_2d'].str.split(',',expand=True)
df2

Unnamed: 0,CODE_CPAM,NUM_FINESS,LISTE,LABORATOIRE,ADRESSE,CODE_POSTAL,VILLE,TELEPHONE,HORAIRES,geo_shape,geo_point_2d,lat,lng
0,75370111,ND,11ème - Boulevard Richard Lenoir,Laboratoire Cerballiance site Chemin Vert,42 boulevard Richard Lenoir,75011,Paris,01 43 55 87 40,"Du lundi au vendredi de 7h30 à 18h30, samedi d...","{""type"": ""Point"", ""coordinates"": [2.3716544701...","48.8577489375,2.37165447016",48.8577489375,2.37165447016
1,75370177,ND,12ème - Rue Jacques Hillairet,Laboratoire Cerballiance site Daumesnil,33 rue Jacques Hillairet,75012,Paris,01 44 87 01 01,"Du lundi au vendredi de 7h30 à 17h15, samedi d...","{""type"": ""Point"", ""coordinates"": [2.3888482980...","48.8433686592,2.38884829808",48.8433686592,2.38884829808
2,75370146,ND,14ème - Avenue Denfert Rochereau,Laboratoire BioGroup LCD site Denfert,87 avenue Denfert Rochereau,75014,Paris,01 44 41 70 50,"Du lundi au vendredi de 7h30 à 18h30, samedi d...","{""type"": ""Point"", ""coordinates"": [2.3336349788...","48.8353183115,2.33363497886",48.8353183115,2.33363497886
3,75370188,ND,15ème - Rue de Vouillé,Laboratoire Cerballiance site Vouillé,20 rue de Vouillé,75015,Paris,01 45 31 55 20,"Du lundi au vendredi de 7h à 18h, samedi de 7h...","{""type"": ""Point"", ""coordinates"": [2.3043358410...","48.8347838657,2.30433584102",48.8347838657,2.30433584102
4,75370040,ND,16ème - Rue de Chaillot,Laboratoire Unlabs site Paris Chaillot,10 rue de Chaillot,75116,Paris,01 53 57 40 40,"Du lundi au vendredi de 8h à 19h, samedi de 8h...","{""type"": ""Point"", ""coordinates"": [2.2972156833...","48.8669660284,2.29721568337",48.8669660284,2.29721568337
...,...,...,...,...,...,...,...,...,...,...,...,...,...
151,75378287,ND,9ème - Rue Drouot,Laboratoire Drouot,21 rue Drouot,75009,Paris,01 45 23 10 45,"Du lundi au vendredi de 8h à 19h, samedi de 8h...","{""type"": ""Point"", ""coordinates"": [2.3406816342...","48.8743241164,2.34068163426",48.8743241164,2.34068163426
152,75370117,ND,18ème - Rue Marx Dormoy,Laboratoire BioGroup LCD site Marx Dormoy,59 rue Marx Dormoy,75018,Paris,01 42 09 33 66,"Du lundi au vendredi de 7h à 19h, samedi de 7h...","{""type"": ""Point"", ""coordinates"": [2.3597052690...","48.8891529597,2.35970526905",48.8891529597,2.35970526905
153,75370196,ND,18ème - Rue de Trétaigne,Laboratoire BioGroup LCD site Duchemin,19 rue de Trétaigne,75018,Paris,01 42 23 50 00,"Du lundi au vendredi de 7h à 19h, samedi de 8h...","{""type"": ""Point"", ""coordinates"": [2.3424324358...","48.8926257546,2.34243243584",48.8926257546,2.34243243584
154,75370457,ND,20ème - Rue de Bagnolet,Laboratoire Biofutur site Bagnolet,55 rue de Bagnolet,75020,Paris,01 43 70 34 75,Du lundi au vendredi de 7h45 à 18h30,"{""type"": ""Point"", ""coordinates"": [2.3982132559...","48.8571855667,2.39821325595",48.8571855667,2.39821325595


In [13]:
df2["lat"] = pd.to_numeric(df2["lat"])
df2["lng"] = pd.to_numeric(df2["lng"])

### Using the OpenCage GeoCoder API

In [14]:
API_KEY='1c54d5371e674c679c38e4fe2b1b1bec'

In [15]:
import json

latitudes = [] # Initializing the latitude array
longitudes = [] # Initializing the longitude array

for postal_code in df["Arrondissement"]: 
    place_name = postal_code  # Formats the place name
    url = 'https://api.opencagedata.com/geocode/v1/json?q={}&key={}'.format(place_name, API_KEY) # Gets the proper url to make the API call
    obj = json.loads(requests.get(url).text) # Loads the JSON file in the form of a python dictionary
    results = obj['results'] # Extracts the results information out of the JSON file
    lat = results[0]['geometry']['lat'] # Extracts the latitude value
    lng = results[0]['geometry']['lng'] # Extracts the longitude value
    latitudes.append(lat) # Appending to the list of latitudes
    longitudes.append(lng) # Appending to the list of longitudes

In [16]:
df['Latitude'] = latitudes
df['Longitude'] = longitudes

In [17]:
df

Unnamed: 0,Postcodes,Arrondissement,Latitude,Longitude
0,75001,Paris 1er Arrondissement,48.864614,2.334396
1,75002,Paris 2e Arrondissement,48.867684,2.343126
2,75003,Paris 3e Arrondissement,48.864212,2.360936
3,75004,Paris 4e Arrondissement,48.856202,2.355619
4,75005,Paris 5e Arrondissement,48.845973,2.34435
5,75006,Paris 6e Arrondissement,48.850433,2.332951
6,75007,Paris 7e Arrondissement,48.857028,2.320195
7,75008,Paris 8e Arrondissement,48.87748,2.31765
8,75009,Paris 9e Arrondissement,48.876019,2.339962
9,75010,Paris 10e Arrondissement,48.876126,2.359839


### Paris Map using folium


In [18]:
import folium
# Toronto latitude and longitude using Google search
paris_lat =48.8566
paris_lng = 2.3522

# Creates map of Toronto using latitude and longitude values
map_paris = folium.Map(location=[paris_lat, paris_lng], zoom_start=12)

# Add markers to map
for lat, lng, arron in zip(df['Latitude'], df['Longitude'], df['Arrondissement']):
    label = '{}'.format(arron)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_paris)  
    
map_paris

### Adding the labs to paris map

In [19]:
for lat, lng, lab in zip(df2['lat'], df2['lng'], df2['LABORATOIRE']):
    label = '{}'.format(lab )
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat,lng],
        radius=5,
        popup=label,
        color='red',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_paris)  

In [20]:
map_paris

### Foursquare API

In [41]:
CLIENT_ID = 'P5RVXQMDTMJTVLZDSAXAFLPYW1CSZ3A5KSMI2OOXWLQQ3LGI' # your Foursquare ID
CLIENT_SECRET = 'VWQVCHB4ERNCBRXREGNKWWIMSKZVKJUTDBFSDXAL0YJKFK1S' # your Foursquare Secret
ACCESS_TOKEN = 'DR4VOEHRB3NBWSSFJ0NEOYB43MNJ1SVDO1DQCXJ4C3J1C0JU' # your FourSquare Access Token
VERSION = '20190425'
LIMIT = 30
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: P5RVXQMDTMJTVLZDSAXAFLPYW1CSZ3A5KSMI2OOXWLQQ3LGI
CLIENT_SECRET:VWQVCHB4ERNCBRXREGNKWWIMSKZVKJUTDBFSDXAL0YJKFK1S


### Librairies for clustering and treating json files

In [22]:

# Downloading folium, if not installed
#!conda install -c conda-forge folium=0.5.0 --yes
import folium # Map plotting library
import numpy as np
from pandas.io.json import json_normalize
from pandas.io.json import json_normalize
# Tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# Import k-means from clustering stage
from sklearn.cluster import KMeans

In [23]:
def get_category_type(row):
    try:
        categories_list = row['Category']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']


### Using the FourSquare API on all neighbourhoods


In [42]:
for i, nbd_name in enumerate(df2['LISTE']):  
    try :         
        ### Getting the data of neighbourhood
        nbd_name = df2.loc[i, 'LISTE']
        nbd_lat = df2.loc[i, 'lat']
        nbd_lng = df2.loc[i, 'lng']
        radius = 500 # Setting the radius as 500 metres
        LIMIT = 100 # Getting the top 100 venues
        url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&categoryId=4bf58dd8d48988d196941735,4d4b7105d754a06372d81259,4bf58dd8d48988d13b941735,4bf58dd8d48988d129951735&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, nbd_lat, nbd_lng, VERSION, radius, LIMIT)
        results = json.loads(requests.get(url).text)
        results = results['response']['groups'][0]['items']
        nearby = json_normalize(results) # Flattens JSON
        # Filtering the columns
        filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
        nearby = nearby.loc[:, filtered_columns]
        # Renaming the columns
        columns = ['Name', 'Category', 'Latitude', 'Longitude']
        nearby.columns = columns
        # Gets the categories
        nearby['Category'] = nearby.apply(get_category_type, axis=1)
        # Gets the data required
        for i, name in enumerate(nearby['Name']):
            explore_df_list.append([nbd_name, nbd_lat, nbd_lng] + nearby.loc[i, :].values.tolist())
    except Exception as e:
        pass

  nearby = json_normalize(results) # Flattens JSON


In [43]:
explore_df = pd.DataFrame([item for item in explore_df_list])
explore_df.columns = ['Neighbourhood', 'Neighbourhood Latitude', 'Neighbourhood Longitude', 'Venue Name', 'Venue Category', 'Venue Latitude', 'Venue Longitude']
explore_df.head()

Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue Name,Venue Category,Venue Latitude,Venue Longitude
0,11ème - Boulevard Richard Lenoir,48.857749,2.371654,Institut Fassaha,School,48.860404,2.374422
1,11ème - Boulevard Richard Lenoir,48.857749,2.371654,École Maternelle Alphonse Baudin,School,48.86122,2.37011
2,11ème - Boulevard Richard Lenoir,48.857749,2.371654,Cabinet de psychotherapie Emmanuelle LECOMTE,Medical Center,48.85367,2.372561
3,11ème - Boulevard Richard Lenoir,48.857749,2.371654,École Nationale Supérieure de Création Industr...,General College & University,48.858423,2.369635
4,11ème - Boulevard Richard Lenoir,48.857749,2.371654,École élémentaire de la rue Froment,Elementary School,48.857646,2.372632


### Adding the dataset to the API request results

In [44]:
toappend=df2[["LISTE","lat", "lng","LABORATOIRE"]]
toappend=toappend.rename(columns={"LISTE": "Neighbourhood", "lat": "Neighbourhood Latitude","lng": "Neighbourhood Longitude","LABORATOIRE": "Venue Name"})
toappend['Venue Category']="Laboratoire"
toappend['Venue Latitude']=toappend["Neighbourhood Latitude"]
toappend['Venue Longitude']=toappend["Neighbourhood Longitude"]

In [45]:
toappend

Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue Name,Venue Category,Venue Latitude,Venue Longitude
0,11ème - Boulevard Richard Lenoir,48.857749,2.371654,Laboratoire Cerballiance site Chemin Vert,Laboratoire,48.857749,2.371654
1,12ème - Rue Jacques Hillairet,48.843369,2.388848,Laboratoire Cerballiance site Daumesnil,Laboratoire,48.843369,2.388848
2,14ème - Avenue Denfert Rochereau,48.835318,2.333635,Laboratoire BioGroup LCD site Denfert,Laboratoire,48.835318,2.333635
3,15ème - Rue de Vouillé,48.834784,2.304336,Laboratoire Cerballiance site Vouillé,Laboratoire,48.834784,2.304336
4,16ème - Rue de Chaillot,48.866966,2.297216,Laboratoire Unlabs site Paris Chaillot,Laboratoire,48.866966,2.297216
...,...,...,...,...,...,...,...
151,9ème - Rue Drouot,48.874324,2.340682,Laboratoire Drouot,Laboratoire,48.874324,2.340682
152,18ème - Rue Marx Dormoy,48.889153,2.359705,Laboratoire BioGroup LCD site Marx Dormoy,Laboratoire,48.889153,2.359705
153,18ème - Rue de Trétaigne,48.892626,2.342432,Laboratoire BioGroup LCD site Duchemin,Laboratoire,48.892626,2.342432
154,20ème - Rue de Bagnolet,48.857186,2.398213,Laboratoire Biofutur site Bagnolet,Laboratoire,48.857186,2.398213


### Final Result

In [46]:
explore_df.append(toappend,ignore_index = True)

Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue Name,Venue Category,Venue Latitude,Venue Longitude
0,11ème - Boulevard Richard Lenoir,48.857749,2.371654,Institut Fassaha,School,48.860404,2.374422
1,11ème - Boulevard Richard Lenoir,48.857749,2.371654,École Maternelle Alphonse Baudin,School,48.861220,2.370110
2,11ème - Boulevard Richard Lenoir,48.857749,2.371654,Cabinet de psychotherapie Emmanuelle LECOMTE,Medical Center,48.853670,2.372561
3,11ème - Boulevard Richard Lenoir,48.857749,2.371654,École Nationale Supérieure de Création Industr...,General College & University,48.858423,2.369635
4,11ème - Boulevard Richard Lenoir,48.857749,2.371654,École élémentaire de la rue Froment,Elementary School,48.857646,2.372632
...,...,...,...,...,...,...,...
2625,9ème - Rue Drouot,48.874324,2.340682,Laboratoire Drouot,Laboratoire,48.874324,2.340682
2626,18ème - Rue Marx Dormoy,48.889153,2.359705,Laboratoire BioGroup LCD site Marx Dormoy,Laboratoire,48.889153,2.359705
2627,18ème - Rue de Trétaigne,48.892626,2.342432,Laboratoire BioGroup LCD site Duchemin,Laboratoire,48.892626,2.342432
2628,20ème - Rue de Bagnolet,48.857186,2.398213,Laboratoire Biofutur site Bagnolet,Laboratoire,48.857186,2.398213


In [47]:
len(explore_df['Venue Category'].unique())

74

### One Hot encoding for Scikit-Learn

In [90]:
# One hot encoding
paris_onehot = pd.get_dummies(explore_df[['Venue Category']], prefix="", prefix_sep="")

# Add neighborhood column back to dataframe
paris_onehot['Neighbourhood'] = explore_df['Neighbourhood'] 

# Move neighborhood column to the first column
fixed_columns = [paris_onehot.columns[-1]] + list(paris_onehot.columns[:-1])
paris_onehot = paris_onehot[fixed_columns]

paris_onehot.head()

Unnamed: 0,Neighbourhood,Adult Education Center,Alternative Healer,Art Gallery,Athletics & Sports,Building,Business Service,Café,College & University,College Academic Building,...,Sorority House,Student Center,Swim School,Tech Startup,Theater,Tourist Information Center,Trade School,Train,Train Station,University
0,11ème - Boulevard Richard Lenoir,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,11ème - Boulevard Richard Lenoir,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,11ème - Boulevard Richard Lenoir,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,11ème - Boulevard Richard Lenoir,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,11ème - Boulevard Richard Lenoir,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Grouping by neighbourhood

In [52]:
paris_grouped = paris_onehot.groupby('Neighbourhood').mean().reset_index()
paris_grouped.head()

Unnamed: 0,Neighbourhood,Adult Education Center,Alternative Healer,Art Gallery,Athletics & Sports,Building,Business Service,Café,College & University,College Academic Building,...,Sorority House,Student Center,Swim School,Tech Startup,Theater,Tourist Information Center,Trade School,Train,Train Station,University
0,10ème - Boulevard de Magenta,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.044118,0.0,...,0.0,0.0,0.0,0.0,0.029412,0.0,0.117647,0.029412,0.073529,0.029412
1,10ème - Rue La Fayette,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.057143,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.057143,0.085714,0.142857,0.057143
2,10ème - Rue de Dunkerque - Gare du Nord,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.027778,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.055556,0.111111,0.111111,0.027778
3,10ème - Rue du Faubourg Saint Martin,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.103448,...,0.0,0.034483,0.0,0.0,0.0,0.0,0.206897,0.0,0.103448,0.068966
4,11ème - Avenue Ledru Rollin,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Function for most commun venues

In [53]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

### Creating a new dataframe to get the top 10 venues


In [80]:
num_top_venues = 10
indicators = ['st', 'nd', 'rd']

# Create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# Create a new dataframe
neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['Neighbourhood'] = paris_grouped['Neighbourhood']

for ind in np.arange(paris_grouped.shape[0]):
    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(paris_grouped.iloc[ind, :], num_top_venues)

neighbourhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,10ème - Boulevard de Magenta,Platform,Trade School,High School,Train Station,Middle School,College & University,College Arts Building,University,Medical School,Nursery School
1,10ème - Rue La Fayette,Platform,Train Station,Train,High School,University,Trade School,College & University,Middle School,College Classroom,Hospital
2,10ème - Rue de Dunkerque - Gare du Nord,Platform,Train,Train Station,High School,Hospital,Trade School,Middle School,University,College & University,College Classroom
3,10ème - Rue du Faubourg Saint Martin,Trade School,Platform,College Academic Building,Train Station,Hospital,University,General College & University,Elementary School,College Classroom,College Cafeteria
4,11ème - Avenue Ledru Rollin,Elementary School,College Arts Building,Music School,College Library,Cooking School,College Academic Building,Middle School,Private School,College Stadium,College Math Building


### K-Means Clustering

In [81]:
# Set number of clusters
kclusters = 20
paris_grouped_clustering = paris_grouped.drop('Neighbourhood', 1)

# Run k-means clustering
kmeans = KMeans(n_clusters = kclusters, random_state = 0).fit(paris_grouped_clustering)

# Check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

# Add clustering labels
neighbourhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

In [82]:
neighbourhoods_venues_sorted

Unnamed: 0,Cluster Labels,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,3,10ème - Boulevard de Magenta,Platform,Trade School,High School,Train Station,Middle School,College & University,College Arts Building,University,Medical School,Nursery School
1,3,10ème - Rue La Fayette,Platform,Train Station,Train,High School,University,Trade School,College & University,Middle School,College Classroom,Hospital
2,3,10ème - Rue de Dunkerque - Gare du Nord,Platform,Train,Train Station,High School,Hospital,Trade School,Middle School,University,College & University,College Classroom
3,6,10ème - Rue du Faubourg Saint Martin,Trade School,Platform,College Academic Building,Train Station,Hospital,University,General College & University,Elementary School,College Classroom,College Cafeteria
4,0,11ème - Avenue Ledru Rollin,Elementary School,College Arts Building,Music School,College Library,Cooking School,College Academic Building,Middle School,Private School,College Stadium,College Math Building
...,...,...,...,...,...,...,...,...,...,...,...,...
131,5,9ème - Rue Vignon,Fraternity House,High School,Elementary School,College Communications Building,College Arts Building,Student Center,Cooking School,General College & University,Train Station,Dance Studio
132,14,9ème - Rue d'Amsterdam,School,Nursery School,Train Station,High School,Hospital,University,College Academic Building,College Arts Building,College Cafeteria,College Classroom
133,14,9ème - Rue de Clichy,School,High School,Hospital,Middle School,Driving School,Elementary School,College Library,Music School,Nursery School,Dance Studio
134,5,9ème - Rue du Delta,High School,University,Office,Café,College & University,College Academic Building,Cooking School,Dance Studio,Driving School,Hospital


### Creating a grand dataframe containing the necessary data


In [86]:
paris_merged = explore_df
paris_merged = paris_merged.join(neighbourhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')
paris_merged.dropna(inplace = True)
paris_merged['Cluster Labels'] = paris_merged['Cluster Labels'].astype(int)
paris_merged.head()

Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue Name,Venue Category,Venue Latitude,Venue Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,11ème - Boulevard Richard Lenoir,48.857749,2.371654,Institut Fassaha,School,48.860404,2.374422,2,General College & University,School,Elementary School,College Arts Building,College Gym,Dance Studio,Cooking School,Language School,Medical Center,Driving School
1,11ème - Boulevard Richard Lenoir,48.857749,2.371654,École Maternelle Alphonse Baudin,School,48.86122,2.37011,2,General College & University,School,Elementary School,College Arts Building,College Gym,Dance Studio,Cooking School,Language School,Medical Center,Driving School
2,11ème - Boulevard Richard Lenoir,48.857749,2.371654,Cabinet de psychotherapie Emmanuelle LECOMTE,Medical Center,48.85367,2.372561,2,General College & University,School,Elementary School,College Arts Building,College Gym,Dance Studio,Cooking School,Language School,Medical Center,Driving School
3,11ème - Boulevard Richard Lenoir,48.857749,2.371654,École Nationale Supérieure de Création Industr...,General College & University,48.858423,2.369635,2,General College & University,School,Elementary School,College Arts Building,College Gym,Dance Studio,Cooking School,Language School,Medical Center,Driving School
4,11ème - Boulevard Richard Lenoir,48.857749,2.371654,École élémentaire de la rue Froment,Elementary School,48.857646,2.372632,2,General College & University,School,Elementary School,College Arts Building,College Gym,Dance Studio,Cooking School,Language School,Medical Center,Driving School


### Visualizing the clusters


In [89]:
# Create map
map_clusters = folium.Map(location=[paris_lat, paris_lng], zoom_start=11)

# Set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(paris_merged['Venue Latitude'], paris_merged['Venue Longitude'], paris_merged['Venue Name'], paris_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' (Cluster ' + str(cluster) + ')', parse_html=True)
    map_clusters.add_child(
        folium.features.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7))
       
map_clusters