# Final Project - Battle of Neigh.
by Frank Ygnacio Rosas

### i. Import main libraries

In [92]:
#importing base libraries 
#to get data
import requests
#for scraping
from bs4 import BeautifulSoup
#base python libraries
import pandas as pd
import numpy as np

In [93]:
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import folium
import json # library to handle JSON files
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
from sklearn.cluster import KMeans
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np # library to handle data in a vectorized manner

In [94]:
import re
import urllib

### Part 1 |  Scrapping information - General Lima DF

#### 1.1. General Wikipedia URL Scrapping

In [95]:
#get the entire html of the url as a str
wikipedia_url = requests.get('https://es.wikipedia.org/wiki/Lima_Metropolitana').text

In [96]:
lima_data = BeautifulSoup(wikipedia_url, 'html.parser') #text to html

In [97]:
info_table = lima_data.find('table', class_ = 'wikitable')
rows_selected = info_table.find_all('tr')

In [98]:
# extract the info ('Postcode', 'Borough', 'Neighbourhood') from the table
lima_info = []
for row in rows_selected:
    info = row.text.split('\n')[1:-1] # remove empty str (first and last items)
    lima_info.append(info)
    
lima_info[:]

[['Ubicación', 'Distrito', 'Población (habitantes )'],
 ['070701', 'Callao', '426\xa0649'],
 ['070702', 'Bellavista', '78\xa0489'],
 ['070703', 'Carmen de La Legua', '43\xa0156'],
 ['070704', 'La Perla', '64\xa0111'],
 ['070705', 'La Punta', '3955'],
 ['070706', 'Mi Perú', '52\xa0722'],
 ['070707', 'Ventanilla', '356\xa0040'],
 ['150101', 'Lima', '276\xa0861'],
 ['150102', 'Ancón', '43\xa0951'],
 ['150103', 'Ate', '638\xa0345'],
 ['150104', 'Barranco', '30\xa0698'],
 ['150105', 'Breña', '77\xa0291'],
 ['150106', 'Carabayllo', '305\xa0963'],
 ['150107', 'Cieneguilla', '47\xa0860'],
 ['150108', 'Chaclacayo', '44\xa0271'],
 ['150109', 'Chorrillos', '330\xa0483'],
 ['150110', 'Comas', '532\xa0403'],
 ['150111', 'El Agustino', '194\xa0474'],
 ['150112', 'Independencia', '220\xa0654'],
 ['150113', 'Jesús María', '73\xa0439'],
 ['150114', 'La Molina', '175\xa0237'],
 ['150115', 'La Victoria', '174\xa0958'],
 ['150116', 'Lince', '51\xa0054'],
 ['150117', 'Los Olivos', '377\xa0532'],
 ['150118'

In [99]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

#### 1.2. Setting Neigh. as Population ("población" in Spanish)

In [100]:
#table (lists of list) to df

lima_info[0][-1] = 'Poblacion'
lima_df = pd.DataFrame(lima_info[1:], columns=lima_info[0])

lima_df.head()

Unnamed: 0,Ubicación,Distrito,Poblacion
0,70701,Callao,426 649
1,70702,Bellavista,78 489
2,70703,Carmen de La Legua,43 156
3,70704,La Perla,64 111
4,70705,La Punta,3955


#### 1.3. Find latitude and longitude for each Lima city district

In [101]:
geolocator = Nominatim(user_agent="lima_explorer")

In [102]:
lima_df['Distrito'] = lima_df['Distrito'].apply(lambda x: "{}{}".format('Lima, ', x))
lima_df.head()

Unnamed: 0,Ubicación,Distrito,Poblacion
0,70701,"Lima, Callao",426 649
1,70702,"Lima, Bellavista",78 489
2,70703,"Lima, Carmen de La Legua",43 156
3,70704,"Lima, La Perla",64 111
4,70705,"Lima, La Punta",3955


In [103]:
lima_df['Major_Dist_Coord']= lima_df['Distrito'].apply(geolocator.geocode).apply(lambda x: (x.latitude, x.longitude))
lima_df[['Latitude', 'Longitude']] = lima_df['Major_Dist_Coord'].apply(pd.Series)
#remove column 'Major_Dist_Coord'
lima_df.drop(['Major_Dist_Coord'], axis=1, inplace=True)

In [104]:
#eliminate with space in poblacion data
lima_df['Poblacion'] = lima_df['Poblacion'].str.replace(" ","")
lima_df = lima_df.sort_values(['Poblacion'], axis=0, ascending = True)
lima_df.head(10)

Unnamed: 0,Ubicación,Distrito,Poblacion,Latitude,Longitude
36,150130,"Lima, San Borja",114 688,-12.096452,-76.99569
28,150122,"Lima, Pachacámac",131 037,-12.251097,-76.906592
42,150136,"Lima, San Miguel",138 226,-12.078656,-77.095283
44,150138,"Lima, Santa María del Mar",1638,-12.401403,-76.775465
34,150128,"Lima, Rímac",167 617,-12.020304,-77.035463
21,150115,"Lima, La Victoria",174 958,-12.073358,-77.016417
20,150114,"Lima, La Molina",175 237,-12.090177,-76.922338
29,150123,"Lima, Pucusana",17 340,-12.482092,-76.797453
17,150111,"Lima, El Agustino",194 474,-12.042052,-76.995714
45,150139,"Lima, Santa Rosa",19 047,-12.035851,-77.086616


#### 1.4. Find latitude and longitude of Lima City

In [105]:
#check the Lima City General Coordinates
address = 'Lima, Peru'
geolocator = Nominatim(user_agent="lima_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Lima City is {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Lima City is -12.0621065, -77.0365256.


### Part 2 |  Listing and Visualizing venues per districts - General Lima Districts Map

#### 2.1. Main ID information and Foursquare Requirements

In [106]:
CLIENT_ID = 'AW532KHX12WPNYVJZ4GRG1HJ1SEOFY5H35LGLLWQQ24SZOZ0' # your Foursquare ID
CLIENT_SECRET = 'F5X1XRL32HAIJCNKOQ3ZADFOTK14AWJBWJCLEHF3LLQOVXSW' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: AW532KHX12WPNYVJZ4GRG1HJ1SEOFY5H35LGLLWQQ24SZOZ0
CLIENT_SECRET:F5X1XRL32HAIJCNKOQ3ZADFOTK14AWJBWJCLEHF3LLQOVXSW


In [107]:
#Get the neighborhood's latitude and longitude values.

neighborhood_latitude = lima_df.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = lima_df.loc[0, 'Longitude'] # neighborhood longitude value

In [108]:
# type your answer here
LIMIT = 100 # limit of number of venues returned by Foursquare API

radius = 1000 # define radius
# create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=AW532KHX12WPNYVJZ4GRG1HJ1SEOFY5H35LGLLWQQ24SZOZ0&client_secret=F5X1XRL32HAIJCNKOQ3ZADFOTK14AWJBWJCLEHF3LLQOVXSW&v=20180605&ll=-12.00365435,-77.11924373751658&radius=1000&limit=100'

#### 2.2. Functions to get districts latitude and longitude inc. venues info

In [109]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [110]:
# function to repeat the exploring process to all the districts in Lima
def getNearbyVenues(names, latitudes, longitudes, radius=5000, categoryIds=''):
    try:
        venues_list=[]
        for name, lat, lng in zip(names, latitudes, longitudes):
            #print(name)

            # create the API request URL
            url = 'https://api.foursquare.com/v2/venues/search?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, VERSION, lat, lng, radius, LIMIT)

            if (categoryIds != ''):
                url = url + '&categoryId={}'
                url = url.format(categoryIds)

            # make the GET request
            response = requests.get(url).json()
            results = response["response"]['venues']

            # return only relevant information for each nearby venue
            for v in results:
                success = False
                try:
                    category = v['categories'][0]['name']
                    success = True
                except:
                    pass

                if success:
                    venues_list.append([(
                        name, 
                        lat, 
                        lng, 
                        v['name'], 
                        v['location']['lat'], 
                        v['location']['lng'],
                        v['categories'][0]['name']
                    )])

        nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
        nearby_venues.columns = ['Localidad', 
                  'Localidad Latitude', 
                  'Localidad Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    except:
        print(url)
        print(response)
        print(results)
        print(nearby_venues)

    return(nearby_venues)

In [111]:
# Use category id 4bf58dd8d48988d16c941735 to only get "food" venues in each district
lima_food_venues = getNearbyVenues(names=lima_df['Distrito'], 
                                     latitudes=lima_df['Latitude'],
                                     longitudes=lima_df['Longitude'],
                                     radius=radius, categoryIds='4d4b7105d754a06374d81259')
lima_food_venues.head()

Unnamed: 0,Localidad,Localidad Latitude,Localidad Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Lima, San Borja",-12.096452,-76.99569,La Nuez,-12.095357,-76.995255,Pastry Shop
1,"Lima, San Borja",-12.096452,-76.99569,Starbucks,-12.093799,-76.995626,Coffee Shop
2,"Lima, San Borja",-12.096452,-76.99569,Madam Tusan,-12.0905,-77.00382,Chinese Restaurant
3,"Lima, San Borja",-12.096452,-76.99569,Pescados Capitales,-12.090926,-77.00403,Seafood Restaurant
4,"Lima, San Borja",-12.096452,-76.99569,La Piccolina,-12.090351,-77.00398,Italian Restaurant


#### 2.3. Scrapping districts segmentation | Five zones

In [112]:
#extracting segmentation of each district in each zone
wikipedia_url_dist = requests.get('https://es.wikipedia.org/wiki/Categor%C3%ADa:Distritos_de_Lima').text
lima_data_dist = BeautifulSoup(wikipedia_url_dist, 'html.parser') #text to html
info_table_dist = lima_data_dist.find('table', class_ = 'nowraplinks')
links = info_table_dist.findAll('a')

In [113]:
districts_and_zone = []
for link in links:
    districts_and_zone.append(link.get('title'))
final_dist_zone = districts_and_zone[3:]
final_dist_zone[18]="Distrito de " + final_dist_zone[18]
final_dist_zone = [string.replace("(aún no redactado)","").replace(" (Lima)","") for string in final_dist_zone]

In [114]:
list_zones = [element for element in final_dist_zone if "Distrito" not in element]
list_index_zones = [final_dist_zone.index(i) for i in list_zones]
pair_index = list(zip(list_index_zones, list_index_zones[1:] + list_index_zones[:1])) 

In [115]:
distDict = {}
for pair in pair_index:
    if pair[1] == 0:
        pair = (pair[0],None)
    else: 
        pass
    distDict[final_dist_zone[pair[0]]]=final_dist_zone[pair[0]+1:pair[1]]

In [116]:
for key in distDict:
    distDict[key]=[val.replace("Distrito de ","").replace("Distrito del ","") for val in distDict[key]]

In [117]:
lima_food_venues["Distrito"]= lima_food_venues["Localidad"].apply(lambda x: x.replace("Lima, ",""))

In [118]:
# color per zone
colors_dict = dict(zip(list_zones,["red","blue","green","orange","yellow"]))

In [119]:
main_list_dfs = []
for key in distDict:
    tempList = distDict[key]
    tempDf = lima_food_venues.query('Distrito in @tempList')
    tempDf["Sector"] = key
    tempDf["sector_color"] = colors_dict[key]
    main_list_dfs.append(tempDf)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [120]:
main_df = pd.concat(main_list_dfs)

In [121]:
main_df.head(7)

Unnamed: 0,Localidad,Localidad Latitude,Localidad Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category,Distrito,Sector,sector_color
166,"Lima, Rímac",-12.020304,-77.035463,Manka Mitu (Olla De Barro),-12.023671,-77.032493,Restaurant,Rímac,Centro de Lima,red
167,"Lima, Rímac",-12.020304,-77.035463,Purito Catacaos,-12.024344,-77.03626,Seafood Restaurant,Rímac,Centro de Lima,red
168,"Lima, Rímac",-12.020304,-77.035463,El Corralito,-12.028688,-77.03671,Fried Chicken Joint,Rímac,Centro de Lima,red
169,"Lima, Rímac",-12.020304,-77.035463,Wasabi & sushi express,-12.020053,-77.03206,Sushi Restaurant,Rímac,Centro de Lima,red
170,"Lima, Rímac",-12.020304,-77.035463,Chifa Hoo Wa,-12.026661,-77.034016,Chinese Restaurant,Rímac,Centro de Lima,red
171,"Lima, Rímac",-12.020304,-77.035463,Super Pollo,-12.026802,-77.034478,Fried Chicken Joint,Rímac,Centro de Lima,red
172,"Lima, Rímac",-12.020304,-77.035463,Chifa Paolin,-12.025936,-77.033539,Chinese Restaurant,Rímac,Centro de Lima,red


#### 2.4. Map of each venue by district

* North: orange
* Center (Mid): red
* East: green
* Modern: blue
* South: yellow

In [122]:
# create map of Lima using latitude and longitude values
map_sm = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label, color in zip(main_df['Venue Latitude'],
                                  main_df['Venue Longitude'],
                                  main_df['Sector'],
                                  main_df["sector_color"]):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=color,
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_sm)  
    
map_sm

In case you cannot see the picture:

In [128]:
from IPython.display import Image
from IPython.core.display import HTML 
Image(url= "https://i.ibb.co/Pz1v5Mr/Screenshot-3.png")

### Part 3 | Exploring Results

Percentage of each venue category per lima sector.

In [123]:
df_multiindex = main_df.groupby(['Sector', 'Venue Category']).size().to_frame("cant")

In [124]:
df_grouped=df_multiindex.groupby(level = "Sector")

In [125]:
#selection top N venues for each sector | proportion
top = 3
for zone in list_zones:
    df_result=df_grouped.get_group(zone).sort_values(by="cant",ascending=False)
    pct_venue_cat_per_zone = df_result/df_result.sum()
    pct_venue_cat_per_zone['cant'] = pct_venue_cat_per_zone['cant'].astype(float).map("{:.2%}".format)
    print(pct_venue_cat_per_zone.head(top),"| Total:",df_result.sum().values[0])

                                      cant
Sector         Venue Category             
Centro de Lima Seafood Restaurant   12.10%
               Peruvian Restaurant  10.08%
               Restaurant            8.87% | Total: 248
                                        cant
Sector            Venue Category            
Lima Residencial  Seafood Restaurant   9.08%
                  Peruvian Restaurant  8.55%
                  Chinese Restaurant   7.50% | Total: 573
                                         cant
Sector            Venue Category             
Cono Este de Lima Peruvian Restaurant  13.33%
                  Seafood Restaurant   10.67%
                  Burger Joint         10.67% | Total: 75
                                          cant
Sector             Venue Category             
Cono Norte de Lima Restaurant           12.38%
                   Peruvian Restaurant  11.43%
                   Fried Chicken Joint  10.95% | Total: 210
                                        cant