## Project Code Notebook for the Project

In [80]:
# importing the necessary libraries for this project as shown below

import json # library to handle JSON files

import numpy as np
import pandas as pd # library for data analysis
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# `pandas.io.json.json_normalize` was deprecated in pandas 1.0 and later removed,
# so prefer the top-level function and only fall back to the old location on
# pandas versions that do not have it yet.
try:
    from pandas import json_normalize  # transform JSON file into a pandas dataframe
except ImportError:
    from pandas.io.json import json_normalize

import folium # map rendering library

# import k-means from clustering stage
from sklearn.cluster import KMeans

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# show every column and every row whenever a dataframe is displayed
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

### Scraping the contents of the Wikipedia page containing the table using BeautifulSoup as shown below

In [81]:
from urllib.request import urlopen as uReg

# Download the Lagos State wikipedia page. The context manager closes the
# connection even when reading the response fails.
with uReg("https://en.wikipedia.org/wiki/Lagos_State") as source: #wikipedia page to be scrapped
    page_html = source.read()

soup = BeautifulSoup(page_html, 'html.parser') #loading the entire wikipedia page for the desired data
tables = soup.find_all("table") #every table element of the loaded wikipedia page

# The table used by this notebook is the third one on the page. Page layouts
# change, so fail with a clear message instead of a bare IndexError when the
# page has fewer tables than expected.
LGA_TABLE_POSITION = 2
if len(tables) <= LGA_TABLE_POSITION:
    raise ValueError(
        "Expected at least {} tables on the page, found {}".format(LGA_TABLE_POSITION + 1, len(tables))
    )
table = tables[LGA_TABLE_POSITION]
#table
 

In [82]:
table_rows = table.tbody.find_all("tr") # every <tr> element of the table body

# Collect the text of the <td> cells of each row and skip rows that have no
# <td> cell at all.
cell_texts_per_row = ([cell.text for cell in table_row.find_all("td")] for table_row in table_rows)
res = [cell_texts for cell_texts in cell_texts_per_row if cell_texts]

# Dataframe with 5 columns
pd.set_option('display.max_rows',None)
column_names = ["LGA name","Area(KM SQUARE)","CENSUS 2006 POPULATION","ADMINISTRATIVE CAPITAL","POSTAL CODE"]
df = pd.DataFrame(res, columns=column_names)
df

Unnamed: 0,LGA name,Area(KM SQUARE),CENSUS 2006 POPULATION,ADMINISTRATIVE CAPITAL,POSTAL CODE
0,Agege\n,11\n,"459,939\n",Agege\n,100\n
1,Alimosho\n,185\n,"1,277,714\n",Ikotun\n,100\n
2,Ifako-Ijaye\n,27\n,"427,878\n",Ifako\n,100\n
3,Ikeja\n,46\n,"313,196\n",Ikeja\n,100\n
4,Kosofe\n,81\n,"665,393\n",Kosofe\n,100\n
5,Mushin\n,17\n,"633,009\n",Mushin\n,100\n
6,Oshodi-Isolo\n,45\n,"621,509\n",Oshodi/Isolo\n,100\n
7,Shomolu\n,12\n,"402,673\n",Shomolu\n,101\n
8,Ikeja Division\n,424\n,"4,801,311\n",\n,\n
9,Apapa\n,27\n,"217,362\n",Apapa\n,101\n


### Data Preparation and Data Cleaning

In [83]:
# remove the newline characters that every scraped cell carries

for column_name in ('LGA name', 'Area(KM SQUARE)', 'CENSUS 2006 POPULATION',
                    'ADMINISTRATIVE CAPITAL', 'POSTAL CODE'):
    df[column_name] = df[column_name].str.replace("\n", "")
df.head()

Unnamed: 0,LGA name,Area(KM SQUARE),CENSUS 2006 POPULATION,ADMINISTRATIVE CAPITAL,POSTAL CODE
0,Agege,11,459939,Agege,100
1,Alimosho,185,1277714,Ikotun,100
2,Ifako-Ijaye,27,427878,Ifako,100
3,Ikeja,46,313196,Ikeja,100
4,Kosofe,81,665393,Kosofe,100


#### All the neighbourhoods with the same postal codes should be grouped together as shown below

In [84]:
# one row per postal code, with the names of its LGAs joined into a single string
lga_names_by_code = df.groupby("POSTAL CODE")["LGA name"]
df_new = lga_names_by_code.agg(", ".join).reset_index()
df_new

Unnamed: 0,POSTAL CODE,LGA name
0,,"Ikeja Division, Lagos Division, Badagry Divisi..."
1,100.0,"Agege, Alimosho, Ifako-Ijaye, Ikeja, Kosofe, M..."
2,101.0,"Shomolu, Apapa, Eti-Osa, Lagos Island, Lagos M..."
3,102.0,"Ajeromi-Ifelodun, Amuwo-Odofin, Ojo"
4,103.0,Badagry
5,104.0,Ikorodu
6,105.0,Ibeju-Lekki
7,106.0,Epe


In [85]:

# Remove the group whose postal code is blank (the "... Division" summary rows).
# The original dropped a positional label taken from the unrelated `df` frame;
# filtering on the value itself never removes a real postal-code group, even
# when the cell is executed again.
# reset_index() deliberately keeps the old labels as an "index" column because
# the following cell removes that column.
has_postal_code = df_new["POSTAL CODE"].astype(str).str.strip() != ""
df_new = df_new[has_postal_code].reset_index()
df_new

Unnamed: 0,index,POSTAL CODE,LGA name
0,1,100,"Agege, Alimosho, Ifako-Ijaye, Ikeja, Kosofe, M..."
1,2,101,"Shomolu, Apapa, Eti-Osa, Lagos Island, Lagos M..."
2,3,102,"Ajeromi-Ifelodun, Amuwo-Odofin, Ojo"
3,4,103,Badagry
4,5,104,Ikorodu
5,6,105,Ibeju-Lekki
6,7,106,Epe


In [86]:
# discard the helper "index" column that reset_index() left behind
df_new = df_new.drop(columns="index")
df_new

Unnamed: 0,POSTAL CODE,LGA name
0,100,"Agege, Alimosho, Ifako-Ijaye, Ikeja, Kosofe, M..."
1,101,"Shomolu, Apapa, Eti-Osa, Lagos Island, Lagos M..."
2,102,"Ajeromi-Ifelodun, Amuwo-Odofin, Ojo"
3,103,Badagry
4,104,Ikorodu
5,105,Ibeju-Lekki
6,106,Epe


#### 3. Get the latitude and the longitude coordinates of each neighborhood.

##### Getting the geographical coordinates of the neighbourhoods from the provided Excel file (check the attached link in the project report) instead of using the geocoder package, as shown below. I make use of pandas to read the file.

In [87]:
import pandas as pd
# read the LGA coordinates from the Excel workbook (path is relative to the working directory)
geo_df = pd.read_excel("Lagos Coordinates.xlsx")
# show every row whenever a dataframe is displayed
pd.set_option("display.max_rows",None)
geo_df

Unnamed: 0,Local Govt,latitude,longitude
0,Agege,6.618,3.3209
1,Alimosho,6.5744,3.257
2,Ifako-Ijaye,6.685,3.2885
3,Ikeja,6.6018,3.3515
4,Kosofe,6.5691,3.3793
5,Mushin,6.5273,3.3414
6,Oshodi-Isolo,6.5355,3.3087
7,Shomolu,6.5392,3.3842
8,Apapa,6.4553,3.3641
9,Eti-Osa,6.459,3.6015


#### Now, I will merge two of the dataframes i.e the df and geo_df as a one dataframe as shown below. I want to retain all the information in the df dataframe and only part of the information in the geo_df dataframe that matches a given condition using a left join

In [88]:
# left join: keep every row of df and attach the coordinates whose
# 'Local Govt' equals the row's 'LGA name'
lagos_complete_df = df.merge(
    geo_df,
    how='left',
    left_on='LGA name',
    right_on='Local Govt',
)
lagos_complete_df.head()

Unnamed: 0,LGA name,Area(KM SQUARE),CENSUS 2006 POPULATION,ADMINISTRATIVE CAPITAL,POSTAL CODE,Local Govt,latitude,longitude
0,Agege,11,459939,Agege,100,Agege,6.618,3.3209
1,Alimosho,185,1277714,Ikotun,100,Alimosho,6.5744,3.257
2,Ifako-Ijaye,27,427878,Ifako,100,Ifako-Ijaye,6.685,3.2885
3,Ikeja,46,313196,Ikeja,100,Ikeja,6.6018,3.3515
4,Kosofe,81,665393,Kosofe,100,Kosofe,6.5691,3.3793


In [89]:
# 'Local Govt' was only the join key taken from geo_df, so drop it from the merged dataframe

lagos_complete_df.drop('Local Govt',axis=1,inplace=True)
lagos_complete_df

Unnamed: 0,LGA name,Area(KM SQUARE),CENSUS 2006 POPULATION,ADMINISTRATIVE CAPITAL,POSTAL CODE,latitude,longitude
0,Agege,11,459939,Agege,100.0,6.618,3.3209
1,Alimosho,185,1277714,Ikotun,100.0,6.5744,3.257
2,Ifako-Ijaye,27,427878,Ifako,100.0,6.685,3.2885
3,Ikeja,46,313196,Ikeja,100.0,6.6018,3.3515
4,Kosofe,81,665393,Kosofe,100.0,6.5691,3.3793
5,Mushin,17,633009,Mushin,100.0,6.5273,3.3414
6,Oshodi-Isolo,45,621509,Oshodi/Isolo,100.0,6.5355,3.3087
7,Shomolu,12,402673,Shomolu,101.0,6.5392,3.3842
8,Ikeja Division,424,4801311,,,,
9,Apapa,27,217362,Apapa,101.0,6.4553,3.3641


In [90]:
# drop the rows at labels 8, 14, 19, 21, 24 and 25, then renumber the remaining rows from 0
rows_to_remove = [8, 14, 19, 21, 24, 25]
lagos_complete_df = lagos_complete_df.drop(index=rows_to_remove).reset_index(drop=True)
lagos_complete_df

Unnamed: 0,LGA name,Area(KM SQUARE),CENSUS 2006 POPULATION,ADMINISTRATIVE CAPITAL,POSTAL CODE,latitude,longitude
0,Agege,11,459939,Agege,100,6.618,3.3209
1,Alimosho,185,1277714,Ikotun,100,6.5744,3.257
2,Ifako-Ijaye,27,427878,Ifako,100,6.685,3.2885
3,Ikeja,46,313196,Ikeja,100,6.6018,3.3515
4,Kosofe,81,665393,Kosofe,100,6.5691,3.3793
5,Mushin,17,633009,Mushin,100,6.5273,3.3414
6,Oshodi-Isolo,45,621509,Oshodi/Isolo,100,6.5355,3.3087
7,Shomolu,12,402673,Shomolu,101,6.5392,3.3842
8,Apapa,27,217362,Apapa,101,6.4553,3.3641
9,Eti-Osa,192,287785,Ikoyi,101,6.459,3.6015


## Methodology

#### Exploratory Data Analysis

##### Lets visualize the different local government areas in Lagos state with respect to their populations

In [91]:
#lets import the necessary library as shown below

import matplotlib.pyplot as plt
from matplotlib import style
style.use("ggplot")
%matplotlib inline

import plotly
import plotly.express as px
import plotly.graph_objects as go
plt.rcParams['figure.figsize']=17,8
import cufflinks as cf
import plotly.offline as pyo
from plotly.offline import download_plotlyjs,init_notebook_mode,plot,iplot
pyo.init_notebook_mode(connected=True)
cf.go_offline()

In [92]:
#lets visualize the local governments with respect to their population using bar chart as shown below
# The scraped population values are still text with thousands separators at this
# point, so a numeric copy is built for plotting; text values would be treated
# as labels rather than as bar heights. The y axis shows head counts, not a
# density, so it is labelled accordingly.
population_chart_df = lagos_complete_df.assign(**{
    'CENSUS 2006 POPULATION': pd.to_numeric(
        lagos_complete_df['CENSUS 2006 POPULATION'].astype(str).str.replace(',', '', regex=False)
    )
})
population_chart_df.iplot(kind='bar',x='LGA name',y='CENSUS 2006 POPULATION', title="Lagos LGA Populations",
                          xTitle="Lagos State Local Govt Areas",yTitle="Population (2006 census)",colors = "blue")


*Just hover over each of the bars above and you can see that Alimosho has the highest population. This might be a good place for setting up our business considering the large population, though we can't conclude yet until we have done further analysis. This is just an overview.*

In [93]:

# Convert the area and population columns from text to numbers.
# astype(str) makes the cell safe to run more than once: after the first run the
# columns are numeric and no longer offer the .str accessor, so the original
# code raised AttributeError on a second execution.
for numeric_column in ('Area(KM SQUARE)', 'CENSUS 2006 POPULATION'):
    without_separators = lagos_complete_df[numeric_column].astype(str).str.replace(',', '', regex=False)
    lagos_complete_df[numeric_column] = pd.to_numeric(without_separators)
lagos_complete_df.head()


Unnamed: 0,LGA name,Area(KM SQUARE),CENSUS 2006 POPULATION,ADMINISTRATIVE CAPITAL,POSTAL CODE,latitude,longitude
0,Agege,11,459939,Agege,100,6.618,3.3209
1,Alimosho,185,1277714,Ikotun,100,6.5744,3.257
2,Ifako-Ijaye,27,427878,Ifako,100,6.685,3.2885
3,Ikeja,46,313196,Ikeja,100,6.6018,3.3515
4,Kosofe,81,665393,Kosofe,100,6.5691,3.3793


In [94]:
# area chart of the land area of every local government area
df_area = lagos_complete_df[['LGA name', 'Area(KM SQUARE)']]
df_area.iplot(
    kind='area',
    x='LGA name',
    y='Area(KM SQUARE)',
    title="Lagos LGA LandMass",
    xTitle="Lagos State Local Govt Areas",
    yTitle="Land Area in KM SQUARE",
    colors="red",
)



* From the above area plot, we can see that Epe Local Govt has the largest landmass, followed by Ibeju-Lekki. This in a way suggests that land for erecting buildings in Epe might be cheaper compared to other areas where the landmass is smaller, all things being equal. We can't conclude with certainty until we do further analysis, but this can be a hint of the overall picture.

### Exploring and Clustering the neighbourhoods in Lagos as shown below

In [95]:
#using geolocator to get the coordinate of lagos state Nigeria
address = 'Lagos, NG-LA'

geolocator = Nominatim(user_agent="ny_explorer")
# a generous timeout avoids spurious failures when the geocoding service is slow
location = geolocator.geocode(address, timeout=10)
# geocode() returns None when nothing matches; fail with a clear message
# instead of an AttributeError on the attribute access below
if location is None:
    raise ValueError("Could not geocode address: {!r}".format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Lagos City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Lagos City are 6.4550575, 3.3941795.


#### Creating a map of Lagos state  with neighborhood areas  superimposed on top.

In [97]:
# first, I will create map of Lagos using its latitude and longitude values before adding marker as shown below
# the map is centred on the `latitude` / `longitude` values obtained from the geocoder
lagos_map = folium.Map(location=[latitude, longitude], zoom_start=10)
lagos_map

### adding markers to the Lagos State map above as shown below

In [98]:
# one circle marker per local government area, with a popup describing it
for lat, lng, lga, area, census, admin in zip(
        lagos_complete_df['latitude'],
        lagos_complete_df['longitude'],
        lagos_complete_df['LGA name'],
        lagos_complete_df['Area(KM SQUARE)'],
        lagos_complete_df['CENSUS 2006 POPULATION'],
        lagos_complete_df['ADMINISTRATIVE CAPITAL']):
    # The original template had two placeholders for four arguments, so the
    # population and the administrative capital never reached the popup.
    label = '{} - area: {} sq km, population (2006): {}, capital: {}'.format(lga, area, census, admin)
    label = folium.Popup(label, parse_html=True)
    # parse_html belongs to the Popup above, not to the marker, so it is not
    # repeated here
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(lagos_map)

lagos_map

### Exploring a single neighbourhood (row 12 of lagos_complete_df, Surulere) as shown below


In [99]:
# name of the LGA stored at label 12 of lagos_complete_df
first_neighborhood_name = lagos_complete_df.at[12, 'LGA name']
print("The first neighborhood name is '{}'.".format(first_neighborhood_name))

The first neighborhood name is 'Surulere'.


In [100]:
# name and coordinates of the LGA stored at label 12 of lagos_complete_df
neighborhood_name = lagos_complete_df.at[12, 'LGA name'] # neighborhood name
neighborhood_latitude = lagos_complete_df.at[12, 'latitude'] # neighborhood latitude value
neighborhood_longitude = lagos_complete_df.at[12, 'longitude'] # neighborhood longitude value

print(f'Latitude and longitude values of {neighborhood_name} are {neighborhood_latitude}, {neighborhood_longitude}.')

Latitude and longitude values of Surulere are 6.4983, 3.3486.


#### Now, let's get the top 100 venues that are in Surulere within a radius of 500 meters.

In [208]:
import os

# The Foursquare credentials are read from the environment so that they never
# have to be typed into (or cleared from) the notebook before sharing it. They
# fall back to empty strings, i.e. to the placeholders this notebook was
# published with.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # my Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # my Foursquare Secret
VERSION = '20180605' # Foursquare API version
radius = 500
LIMIT = 100

# request URL for the venues around the neighborhood selected in the previous cell
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)

In [209]:
# A timeout keeps the cell from hanging forever, and raise_for_status() reports
# an HTTP error (for example rejected credentials) right here instead of as a
# missing key in a later cell.
response = requests.get(url, timeout=30)
response.raise_for_status()
results = response.json()

#### Writing a function that extracts the categories of the data


In [210]:
def get_category_type(row):
    """Return the name of the first category of a venue row, or None.

    `row` is indexed by key: 'categories' is tried first and
    'venue.categories' is used when that key is missing. None is returned
    when the category list is empty.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # only a missing key triggers the fallback; any other error propagates
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    return categories_list[0]['name']

In [211]:
# flatten the venue items of the API response into a dataframe
venues = results['response']['groups'][0]['items']
nearby_venues = json_normalize(venues)

# keep only the name, the categories and the coordinates of every venue
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns]

# reduce the category list of every row to the name of its first category
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# keep only the part of each column name after its last dot
nearby_venues.columns = [column_name.rsplit(".", 1)[-1] for column_name in nearby_venues.columns]

nearby_venues

Unnamed: 0,name,categories,lat,lng
0,Emem's Haven,African Restaurant,6.497461,3.349001
1,Nwandos Signature,Bridal Shop,6.49785,3.349907
2,Busy Bees,Bar,6.496751,3.3468
3,Bar 16,Bar,6.500071,3.346901
4,Mr Biggs Ijeshatedo,Fast Food Restaurant,6.499886,3.350667
5,Chicken Republic,Fast Food Restaurant,6.495759,3.349176
6,MeddySpa,Spa,6.495132,3.34818
7,Momentum,Lounge,6.501848,3.349575
8,Finicky,Cafeteria,6.501049,3.345378
9,Canice Hans Pharmacy,Pharmacy,6.502389,3.347469


##  Explore Neighborhoods in Lagos

#### Let's create a function to repeat the same process to all the neighborhoods in Lagos

In [212]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Collect the Foursquare venues around every neighborhood.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Name and coordinates of each neighborhood; they are walked in parallel.
    radius : int, default 500
        Search radius that is passed to the API.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame
        is empty, but still has these columns, when no venue is returned.
        'Venue Category' is None for a venue without any category.

    Raises
    ------
    requests.HTTPError
        When the API answers with an error status.

    Notes
    -----
    Uses the notebook-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT.
    """
    column_names = ['Neighborhood',
                    'Neighborhood Latitude',
                    'Neighborhood Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']
    venue_rows = []

    for name, lat, lng in zip(names, latitudes, longitudes):
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)

        # make the GET request; the timeout keeps one slow call from hanging the
        # whole loop and raise_for_status() reports HTTP errors explicitly
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        items = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information of each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue['categories']
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue may come without categories; indexing [0] would raise
                categories[0]['name'] if categories else None))

    # passing the column names to the constructor also works for zero rows,
    # where assigning `.columns` afterwards raised a length-mismatch error
    return pd.DataFrame(venue_rows, columns=column_names)

In [213]:
# fetch the venues around every LGA listed in lagos_complete_df
lagos_complete_df_venue = getNearbyVenues(
    lagos_complete_df['LGA name'],
    lagos_complete_df['latitude'],
    lagos_complete_df['longitude'],
)
lagos_complete_df_venue.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Agege,6.618,3.3209,KFC,6.620788,3.317968,Fast Food Restaurant
1,Agege,6.618,3.3209,Kelani St,6.618931,3.317609,Park
2,Alimosho,6.5744,3.257,Joycelyn ice cream,6.571974,3.256003,Food Truck
3,Alimosho,6.5744,3.257,7 Days Inn,6.574307,3.260634,Bar
4,Ikeja,6.6018,3.3515,Goat Hunters,6.601134,3.351368,African Restaurant


#### Let's check the size of the resulting dataframe


In [214]:
# number of venue rows that were collected
len(lagos_complete_df_venue)

38

In [215]:
# number of non-null values per column for every neighborhood, i.e. how many venues each one returned
lagos_complete_df_venue.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agege,2,2,2,2,2,2
Alimosho,2,2,2,2,2,2
Apapa,1,1,1,1,1,1
Ikeja,4,4,4,4,4,4
Ikorodu,2,2,2,2,2,2
Kosofe,2,2,2,2,2,2
Lagos Island,4,4,4,4,4,4
Lagos Mainland,8,8,8,8,8,8
Mushin,2,2,2,2,2,2
Oshodi-Isolo,1,1,1,1,1,1


## 3. Analyze Each Neighborhood

In [216]:
# one hot encoding of the venue categories
lagos_complete_df_venue_onehot = pd.get_dummies(lagos_complete_df_venue[['Venue Category']], prefix="", prefix_sep="")

# A venue category can itself be called "Neighborhood". Assigning the
# neighborhood names to a column of that name would overwrite the category in
# place, and moving "the last column" to the front would then move the wrong
# column, so such a category column is renamed first.
if 'Neighborhood' in lagos_complete_df_venue_onehot.columns:
    lagos_complete_df_venue_onehot = lagos_complete_df_venue_onehot.rename(
        columns={'Neighborhood': 'Neighborhood (venue category)'})

# add the neighborhood column back as the first column of the dataframe
lagos_complete_df_venue_onehot.insert(0, 'Neighborhood', lagos_complete_df_venue['Neighborhood'])

lagos_complete_df_venue_onehot.head()


Unnamed: 0,Neighborhood,African Restaurant,Bakery,Bar,Bridal Shop,Cafeteria,Candy Store,Department Store,Fast Food Restaurant,Food Truck,Hotel,Hotel Bar,Lounge,Movie Theater,Nightclub,Park,Pharmacy,Pool Hall,Residential Building (Apartment / Condo),Shopping Mall,Spa
0,Agege,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
1,Agege,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
2,Alimosho,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,Alimosho,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Ikeja,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


#### Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [217]:
# mean of every one-hot column per neighborhood, i.e. the share of each venue category there
lagos_complete_df_venue_onehot_grouped = (
    lagos_complete_df_venue_onehot
    .groupby('Neighborhood')
    .mean()
    .reset_index()
)
lagos_complete_df_venue_onehot_grouped.head()

Unnamed: 0,Neighborhood,African Restaurant,Bakery,Bar,Bridal Shop,Cafeteria,Candy Store,Department Store,Fast Food Restaurant,Food Truck,Hotel,Hotel Bar,Lounge,Movie Theater,Nightclub,Park,Pharmacy,Pool Hall,Residential Building (Apartment / Condo),Shopping Mall,Spa
0,Agege,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0
1,Alimosho,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Apapa,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Ikeja,0.25,0.25,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0
4,Ikorodu,0.0,0.0,0.5,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Let's confirm the new size

In [218]:
# (number of neighborhoods, number of columns) of the grouped frame
lagos_complete_df_venue_onehot_grouped.shape

(11, 21)

#### Check the 10 most common venues in each neighborhood.

In [219]:
def return_most_common_venues(row, num_top_venues):
    """Return the index labels of the `num_top_venues` largest entries of `row`.

    The first entry of `row` is skipped; the remaining entries are sorted in
    descending order and the labels of the leading ones are returned as an array.
    """
    frequencies = row.iloc[1:]
    ranked_labels = frequencies.sort_values(ascending=False).index.values
    return ranked_labels[:num_top_venues]

num_top_venues = 10


def _ordinal(position):
    """Return `position` with its English ordinal suffix, e.g. 1 -> '1st', 11 -> '11th', 21 -> '21st'."""
    if 10 <= position % 100 <= 20:
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(position % 10, 'th')
    return '{}{}'.format(position, suffix)


# create columns according to number of top venues
# (an explicit suffix rule replaces the former bare `except:`, which was used
# for control flow and would also have hidden genuine errors)
columns = ['Neighborhood'] + [
    '{} Most Common Venue'.format(_ordinal(rank)) for rank in range(1, num_top_venues + 1)
]

# one record per neighborhood: its name followed by its most common venue
# categories; the dataframe is built once instead of being filled row by row
top_venue_records = [
    [grouped_row['Neighborhood'], *return_most_common_venues(grouped_row, num_top_venues)]
    for _, grouped_row in lagos_complete_df_venue_onehot_grouped.iterrows()
]
neighborhoods_venues_sorted = pd.DataFrame(top_venue_records, columns=columns)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agege,Park,Fast Food Restaurant,Spa,Food Truck,Bakery,Bar,Bridal Shop,Cafeteria,Candy Store,Department Store
1,Alimosho,Bar,Food Truck,Spa,Shopping Mall,Bakery,Bridal Shop,Cafeteria,Candy Store,Department Store,Fast Food Restaurant
2,Apapa,Fast Food Restaurant,Spa,Shopping Mall,Bakery,Bar,Bridal Shop,Cafeteria,Candy Store,Department Store,Food Truck
3,Ikeja,African Restaurant,Bakery,Pharmacy,Fast Food Restaurant,Food Truck,Bar,Bridal Shop,Cafeteria,Candy Store,Department Store
4,Ikorodu,Bar,Candy Store,Spa,Shopping Mall,Bakery,Bridal Shop,Cafeteria,Department Store,Fast Food Restaurant,Food Truck


##  Cluster Neighborhoods

In [220]:

# set number of clusters
kclusters = 5

# `drop('Neighborhood', 1)` relied on a positional axis argument that recent
# pandas versions reject; naming the column explicitly works everywhere.
lagos_complete_df_venue_onehot_grouped_clustering = lagos_complete_df_venue_onehot_grouped.drop(columns='Neighborhood')

# run k-means clustering; n_init is pinned so that the result does not depend on
# the default of the installed scikit-learn version
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(lagos_complete_df_venue_onehot_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([2, 4, 2, 3, 4, 0, 3, 3, 4, 1])

In [221]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

lagos_complete_df_merged = lagos_complete_df

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
lagos_complete_df_merged = lagos_complete_df_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='LGA name')

lagos_complete_df_merged.head() # check the last columns!

Unnamed: 0,LGA name,Area(KM SQUARE),CENSUS 2006 POPULATION,ADMINISTRATIVE CAPITAL,POSTAL CODE,latitude,longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agege,11,459939,Agege,100,6.618,3.3209,2.0,Park,Fast Food Restaurant,Spa,Food Truck,Bakery,Bar,Bridal Shop,Cafeteria,Candy Store,Department Store
1,Alimosho,185,1277714,Ikotun,100,6.5744,3.257,4.0,Bar,Food Truck,Spa,Shopping Mall,Bakery,Bridal Shop,Cafeteria,Candy Store,Department Store,Fast Food Restaurant
2,Ifako-Ijaye,27,427878,Ifako,100,6.685,3.2885,,,,,,,,,,,
3,Ikeja,46,313196,Ikeja,100,6.6018,3.3515,3.0,African Restaurant,Bakery,Pharmacy,Fast Food Restaurant,Food Truck,Bar,Bridal Shop,Cafeteria,Candy Store,Department Store
4,Kosofe,81,665393,Kosofe,100,6.5691,3.3793,0.0,Spa,Residential Building (Apartment / Condo),Food Truck,Bakery,Bar,Bridal Shop,Cafeteria,Candy Store,Department Store,Fast Food Restaurant


#### Let's drop the rows whose Cluster Labels value is NaN to enable us to visualize the clusters on a map

In [222]:
# keep only the LGAs that received a cluster label; reset_index() stores the
# previous row labels in an "index" column
has_cluster_label = lagos_complete_df_merged['Cluster Labels'].notna()
lagos_complete_df_merged = lagos_complete_df_merged.loc[has_cluster_label].reset_index()
lagos_complete_df_merged

Unnamed: 0,index,LGA name,Area(KM SQUARE),CENSUS 2006 POPULATION,ADMINISTRATIVE CAPITAL,POSTAL CODE,latitude,longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,0,Agege,11,459939,Agege,100,6.618,3.3209,2.0,Park,Fast Food Restaurant,Spa,Food Truck,Bakery,Bar,Bridal Shop,Cafeteria,Candy Store,Department Store
1,1,Alimosho,185,1277714,Ikotun,100,6.5744,3.257,4.0,Bar,Food Truck,Spa,Shopping Mall,Bakery,Bridal Shop,Cafeteria,Candy Store,Department Store,Fast Food Restaurant
2,3,Ikeja,46,313196,Ikeja,100,6.6018,3.3515,3.0,African Restaurant,Bakery,Pharmacy,Fast Food Restaurant,Food Truck,Bar,Bridal Shop,Cafeteria,Candy Store,Department Store
3,4,Kosofe,81,665393,Kosofe,100,6.5691,3.3793,0.0,Spa,Residential Building (Apartment / Condo),Food Truck,Bakery,Bar,Bridal Shop,Cafeteria,Candy Store,Department Store,Fast Food Restaurant
4,5,Mushin,17,633009,Mushin,100,6.5273,3.3414,4.0,Bakery,Bar,Spa,Shopping Mall,Bridal Shop,Cafeteria,Candy Store,Department Store,Fast Food Restaurant,Food Truck
5,6,Oshodi-Isolo,45,621509,Oshodi/Isolo,100,6.5355,3.3087,1.0,Bakery,Spa,Shopping Mall,Bar,Bridal Shop,Cafeteria,Candy Store,Department Store,Fast Food Restaurant,Food Truck
6,8,Apapa,27,217362,Apapa,101,6.4553,3.3641,2.0,Fast Food Restaurant,Spa,Shopping Mall,Bakery,Bar,Bridal Shop,Cafeteria,Candy Store,Department Store,Food Truck
7,10,Lagos Island,9,209437,Lagos Island,101,6.4549,3.4246,3.0,Hotel,Pool Hall,Bar,Hotel Bar,Food Truck,Bakery,Bridal Shop,Cafeteria,Candy Store,Department Store
8,11,Lagos Mainland,19,317720,Lagos Mainland,101,6.5059,3.3781,3.0,Fast Food Restaurant,Nightclub,African Restaurant,Movie Theater,Department Store,Shopping Mall,Bakery,Bar,Bridal Shop,Cafeteria
9,12,Surulere,23,503975,Surulere,101,6.4983,3.3486,3.0,Bar,Fast Food Restaurant,Spa,Lounge,Bridal Shop,Cafeteria,African Restaurant,Pharmacy,Movie Theater,Nightclub


In [223]:
# the labels became floats when NaN rows were present; turn them back into integers
lagos_complete_df_merged = lagos_complete_df_merged.astype({"Cluster Labels":'int64'})
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one colour per cluster, evenly spaced over
# the rainbow colormap
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, kclusters))]

# add markers to the map
for lat, lon, poi, cluster in zip(lagos_complete_df_merged['latitude'], lagos_complete_df_merged['longitude'], lagos_complete_df_merged['LGA name'], lagos_complete_df_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Cluster labels start at 0, so they index the colour list directly; the
    # former `cluster-1` only worked for label 0 through negative-index wraparound.
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

#### Finally, let's Examine each of the clusters

### Cluster 1

In [224]:
# rows labelled 0, showing column 1 plus every column from position 5 onwards
in_cluster = lagos_complete_df_merged['Cluster Labels'] == 0.0
shown_columns = lagos_complete_df_merged.columns[[1, *range(5, lagos_complete_df_merged.shape[1])]]
lagos_complete_df_merged.loc[in_cluster, shown_columns]


Unnamed: 0,LGA name,POSTAL CODE,latitude,longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
3,Kosofe,100,6.5691,3.3793,0,Spa,Residential Building (Apartment / Condo),Food Truck,Bakery,Bar,Bridal Shop,Cafeteria,Candy Store,Department Store,Fast Food Restaurant


### Cluster 2

In [225]:
# rows labelled 1, showing column 1 plus every column from position 5 onwards
in_cluster = lagos_complete_df_merged['Cluster Labels'] == 1.0
shown_columns = lagos_complete_df_merged.columns[[1, *range(5, lagos_complete_df_merged.shape[1])]]
lagos_complete_df_merged.loc[in_cluster, shown_columns]


Unnamed: 0,LGA name,POSTAL CODE,latitude,longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
5,Oshodi-Isolo,100,6.5355,3.3087,1,Bakery,Spa,Shopping Mall,Bar,Bridal Shop,Cafeteria,Candy Store,Department Store,Fast Food Restaurant,Food Truck


### cluster 3

In [226]:
# rows labelled 2, showing column 1 plus every column from position 5 onwards
in_cluster = lagos_complete_df_merged['Cluster Labels'] == 2.0
shown_columns = lagos_complete_df_merged.columns[[1, *range(5, lagos_complete_df_merged.shape[1])]]
lagos_complete_df_merged.loc[in_cluster, shown_columns]


Unnamed: 0,LGA name,POSTAL CODE,latitude,longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agege,100,6.618,3.3209,2,Park,Fast Food Restaurant,Spa,Food Truck,Bakery,Bar,Bridal Shop,Cafeteria,Candy Store,Department Store
6,Apapa,101,6.4553,3.3641,2,Fast Food Restaurant,Spa,Shopping Mall,Bakery,Bar,Bridal Shop,Cafeteria,Candy Store,Department Store,Food Truck


### Cluster 4

In [227]:
# rows labelled 3, showing column 1 plus every column from position 5 onwards
in_cluster = lagos_complete_df_merged['Cluster Labels'] == 3.0
shown_columns = lagos_complete_df_merged.columns[[1, *range(5, lagos_complete_df_merged.shape[1])]]
lagos_complete_df_merged.loc[in_cluster, shown_columns]


Unnamed: 0,LGA name,POSTAL CODE,latitude,longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,Ikeja,100,6.6018,3.3515,3,African Restaurant,Bakery,Pharmacy,Fast Food Restaurant,Food Truck,Bar,Bridal Shop,Cafeteria,Candy Store,Department Store
7,Lagos Island,101,6.4549,3.4246,3,Hotel,Pool Hall,Bar,Hotel Bar,Food Truck,Bakery,Bridal Shop,Cafeteria,Candy Store,Department Store
8,Lagos Mainland,101,6.5059,3.3781,3,Fast Food Restaurant,Nightclub,African Restaurant,Movie Theater,Department Store,Shopping Mall,Bakery,Bar,Bridal Shop,Cafeteria
9,Surulere,101,6.4983,3.3486,3,Bar,Fast Food Restaurant,Spa,Lounge,Bridal Shop,Cafeteria,African Restaurant,Pharmacy,Movie Theater,Nightclub


### Cluster 5

In [228]:
# rows labelled 4, showing column 1 plus every column from position 5 onwards
in_cluster = lagos_complete_df_merged['Cluster Labels'] == 4.0
shown_columns = lagos_complete_df_merged.columns[[1, *range(5, lagos_complete_df_merged.shape[1])]]
lagos_complete_df_merged.loc[in_cluster, shown_columns]


Unnamed: 0,LGA name,POSTAL CODE,latitude,longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Alimosho,100,6.5744,3.257,4,Bar,Food Truck,Spa,Shopping Mall,Bakery,Bridal Shop,Cafeteria,Candy Store,Department Store,Fast Food Restaurant
4,Mushin,100,6.5273,3.3414,4,Bakery,Bar,Spa,Shopping Mall,Bridal Shop,Cafeteria,Candy Store,Department Store,Fast Food Restaurant,Food Truck
10,Ikorodu,104,6.6194,3.5105,4,Bar,Candy Store,Spa,Shopping Mall,Bakery,Bridal Shop,Cafeteria,Department Store,Fast Food Restaurant,Food Truck
