In [1]:
import numpy as np
import pandas as pd
from collections import Counter
import json
import ijson
import matplotlib.pyplot as plt
import matplotlib.path as mplPath
import seaborn as sns
import folium
from folium.plugins import HeatMap
%pylab
%matplotlib inline

Using matplotlib backend: Qt5Agg
Populating the interactive namespace from numpy and matplotlib


In [2]:
# Load the dataset from a semicolon-separated, UTF-8 encoded CSV file
edh = pd.read_csv(
    'edh_with_d.csv',
    sep=';',
    encoding='utf-8'
)

In [3]:
# Path of the GeoJSON file that is drawn on the maps below
ed = r'EDH_community_councils.geojson'
# Parse the file inside a context manager so the handle is closed as soon as
# the JSON has been read, instead of being left open until garbage collection.
with open(ed) as geojson_file:
    geo_json_data_edh = json.load(geojson_file)

In [4]:
# Build the mapping between each district and its mean star rating
# (despite the name, the result is a pandas Series indexed by district)
dict_stars_by_district = (
    edh.groupby('district')['stars']
       .mean()
)
dict_stars_by_district

district
Colinton                                 3.583333
Corstorphine                             3.676471
Craigentinny/Meadowbank                  3.437500
Craigleith/Blackhall                     3.909091
Craiglockhart                            3.750000
Craigmillar                              3.722222
Cramond and Barnton                      3.250000
Currie                                   4.000000
Drum Brae                                4.333333
Fairmilehead                             3.500000
Firrhill                                 3.750000
Gilmerton/Inch                           3.571429
Gorgie/Dalry                             3.864706
Grange/Prestonfield                      3.795918
Granton and District                     3.500000
Hutchison/Chesser                        3.437500
Juniper Green                            4.000000
Leith Central                            3.862805
Leith Harbour and Newhaven               3.898148
Leith Links                              

In [5]:
# Mean longitude and latitude of the rows belonging to each district
centers = (
    edh.groupby('district')[['longitude', 'latitude']]
       .mean()
)

In [6]:
# Colour each district according to its mean star rating
map_edh = folium.Map(location=[edh['latitude'].mean(), edh['longitude'].mean()], zoom_start=11)

map_edh.choropleth(
    geo_str=geo_json_data_edh,
    data=dict_stars_by_district,
    columns=['stars'],
    key_on='properties.LABEL',
    fill_color='RdYlGn',
    threshold_scale=[3, 3.5, 4, 4.5],
    fill_opacity=0.8,
    line_opacity=0.8,
    legend_name='Mean of Stars per District'
)

# Put one black marker at the mean coordinates of every district, skipping
# the group whose label is the literal string 'None'.
for district, center in centers.iterrows():
    if district == 'None':
        continue
    # Read the coordinates by column label rather than by integer position:
    # integer keys on a label-indexed Series rely on a deprecated fallback.
    folium.Marker([center['latitude'], center['longitude']],
                  popup=str(district), icon=folium.Icon(color='black')).add_to(map_edh)

map_edh

In [7]:
# Build the mapping between each district and its number of businesses
# (count of non-null business_id values per district)
dict_bizcount_by_district = (
    edh.groupby('district')['business_id']
       .count()
)

In [8]:
# Colour each district according to the number of businesses it contains
map_edh = folium.Map(location=[edh['latitude'].mean(), edh['longitude'].mean()], zoom_start=11)

map_edh.choropleth(
    geo_str=geo_json_data_edh,
    data=dict_bizcount_by_district,
    columns=['business_id'],
    key_on='properties.LABEL',
    fill_color='YlOrRd',
    #threshold_scale=[3, 3.5, 4, 4.5],
    fill_opacity=0.8,
    line_opacity=0.8,
    legend_name='Business per District'
)

# Put one black marker at the mean coordinates of every district, skipping
# the group whose label is the literal string 'None'.
for district, center in centers.iterrows():
    if district == 'None':
        continue
    # Read the coordinates by column label rather than by integer position:
    # integer keys on a label-indexed Series rely on a deprecated fallback.
    folium.Marker([center['latitude'], center['longitude']],
                  popup=str(district), icon=folium.Icon(color='black')).add_to(map_edh)

map_edh

In [9]:
# Mean star rating of the districts with more than 100 businesses (rows),
# highest rated first
(
    edh.groupby('district')['stars']
    .mean()[
        edh.groupby('district').size() > 100
    ]
    .sort_values(ascending=False)
)

district
Stockbridge/Inverleith        4.025000
Merchiston                    3.913793
Leith Harbour and Newhaven    3.898148
Leith Central                 3.862805
Tollcross                     3.839590
Southside                     3.802521
Old Town                      3.787464
New Town/Broughton            3.738160
West End                      3.713068
Name: stars, dtype: float64

In [10]:
# Labels of the districts with more than 100 rows
(
    edh.groupby('district')['stars']
    .mean()[
        edh.groupby('district').size() > 100
    ]
    .index
)

Index([u'Leith Central', u'Leith Harbour and Newhaven', u'Merchiston',
       u'New Town/Broughton', u'Old Town', u'Southside',
       u'Stockbridge/Inverleith', u'Tollcross', u'West End'],
      dtype='object', name=u'district')

In [11]:
# For every district with more than 100 rows, print the ten most frequent
# values of the `cats` column together with their business counts.
# Each print takes a single pre-formatted argument so the cell produces the
# same text under Python 2 (print statement) and Python 3 (print function).
for i in edh.groupby('district')['stars'].mean()[edh.groupby('district').size() > 100].index:
    dis = edh[edh['district'] == i]
    # Two spaces after the colon reproduce the original `print 'district: ', i` output
    print('district:  %s' % i)
    print('Double categories distribution')
    # The whole chain sits inside print(); calling .sort_values() on the
    # result of print(...) would fail on Python 3.
    print(dis.groupby('cats')['business_id'].count().sort_values(ascending=False).head(10))

district:  Leith Central
Double categories distribution
cats
[u'Restaurants', u'Fast Food']        10
[u'Nightlife', u'Pubs']                9
[u'Nightlife', u'Bars']                5
[u'Restaurants', u'Indian']            5
[u'Food', u'Grocery']                  4
[u'Restaurants', u'Italian']           4
[u'Beer', u'Wine & Spirits']           3
[u'Coffee & Tea', u'Restaurants']      3
[u'Beauty & Spas', u'Hair Salons']     3
[u'Food', u'Bakeries']                 3
Name: business_id, dtype: int64
district:  Leith Harbour and Newhaven
Double categories distribution
cats
[u'Shopping', u'Fashion']         8
[u'Nightlife', u'Bars']           6
[u'Food', u'Coffee & Tea']        6
[u'Restaurants', u'Seafood']      5
[u'Restaurants', u'Italian']      5
[u'Restaurants', u'Fast Food']    4
[u'Restaurants', u'Indian']       4
[u'Restaurants', u'French']       4
[u'Restaurants', u'British']      4
[u'Restaurants', u'Chinese']      3
Name: business_id, dtype: int64
district:  Merchiston
Double ca

In [12]:
# Restaurant types (subcategories) and how many businesses each one has,
# keeping only subcategories with more than 13 rows, most common first
(
    edh[edh['main_cat'] == 'Restaurants']
    .groupby('subcat')['business_id']
    .count()[
        edh[edh['main_cat'] == 'Restaurants'].groupby('subcat').size() > 13
    ]
    .sort_values(ascending=False)
)

subcat
Italian                   88
Fast Food                 78
Chinese                   57
Indian                    57
British                   54
Cafes                     43
Thai                      27
French                    27
Fish & Chips              25
Sandwiches                22
Food                      20
Pizza                     20
Seafood                   16
Delis                     16
Burgers                   16
American (Traditional)    16
Breakfast & Brunch        14
Bars                      14
Name: business_id, dtype: int64

In [13]:
# Names of the restaurant subcategories that have more than 13 rows
resto = list(
    edh[edh['main_cat'] == 'Restaurants']
    .groupby('subcat')['business_id']
    .count()[
        edh[edh['main_cat'] == 'Restaurants'].groupby('subcat').size() > 13
    ]
    .index
)

In [14]:
# Display the retained restaurant subcategory names
resto

[u'American (Traditional)',
 u'Bars',
 u'Breakfast & Brunch',
 u'British',
 u'Burgers',
 u'Cafes',
 u'Chinese',
 u'Delis',
 u'Fast Food',
 u'Fish & Chips',
 u'Food',
 u'French',
 u'Indian',
 u'Italian',
 u'Pizza',
 u'Sandwiches',
 u'Seafood',
 u'Thai']

In [15]:
#col = ['#b3e2cd','#fdcdac','#cbd5e8','#66c2a5','#fc8d62','#8da0cb','#fc8d59','#ffffbf','#91cf60','#fc8d59','#a1d76a',
#       '#91bfdb','#f1a340','#f7f7f7','#998ec3','#af8dc3','#f7f7f7','#7fbf7b','#d8b365','#f5f5f5','#5ab4ac','#e9a3c9',
#       '#f7f7f7']

In [16]:
# Colour names, one per restaurant subcategory, used for the map markers.
# The list must be at least as long as the subcategory list.
c = [
    'red', 'blue', 'green',
    'purple', 'orange', 'darkred',
    'lightred', 'beige', 'darkblue',
    'darkgreen', 'cadetblue', 'darkpurple',
    'white', 'pink', 'lightblue',
    'lightgreen', 'gray', 'black',
]

In [17]:
# Number of retained subcategories (to compare with the length of the colour list)
len(resto)

18

In [18]:
# Pair every subcategory with the colour at the same position in `c`
restocol = [[subcat_name, c[position]] for position, subcat_name in enumerate(resto)]

In [19]:
# Subcategory names that act as keys of the subcategory -> colour mapping
dict(restocol).keys()

[u'Bars',
 u'Chinese',
 u'Burgers',
 u'American (Traditional)',
 u'Seafood',
 u'Cafes',
 u'Sandwiches',
 u'British',
 u'Breakfast & Brunch',
 u'Delis',
 u'Food',
 u'French',
 u'Fast Food',
 u'Indian',
 u'Thai',
 u'Pizza',
 u'Fish & Chips',
 u'Italian']

In [20]:
# Map of the restaurants, with each marker coloured by its subcategory
map_edh = folium.Map(location=[edh['latitude'].mean(), edh['longitude'].mean()], zoom_start=12)

# Filter the restaurants and build the subcategory -> colour lookup once;
# recomputing both on every iteration made the loop quadratic in the number
# of rows.
restaurants = edh[edh['main_cat'] == 'Restaurants']
color_by_subcat = dict(restocol)

for lat, lon, subcat in zip(restaurants['latitude'],
                            restaurants['longitude'],
                            restaurants['subcat']):
    # Only subcategories that were assigned a colour get a marker
    if subcat not in color_by_subcat:
        continue
    folium.Marker([lat, lon],
                  popup=str(subcat),
                  icon=folium.Icon(color=color_by_subcat[subcat])).add_to(map_edh)

# Overlay the GeoJSON boundaries with a dashed black outline
folium.GeoJson(geo_json_data_edh,
    style_function=lambda feature: {
        'fillColor': '#fff7bc',
        'color': 'black',
        'weight': 2,
        'dashArray': '5, 5'
    }).add_to(map_edh)

map_edh

In [None]:
# No se aprecian agrupaciones de restaurantes atendiendo a la subcategoría

In [21]:
# Show the subcategory -> marker colour mapping (serves as the legend for the map)
dict(restocol)

{u'American (Traditional)': 'red',
 u'Bars': 'blue',
 u'Breakfast & Brunch': 'green',
 u'British': 'purple',
 u'Burgers': 'orange',
 u'Cafes': 'darkred',
 u'Chinese': 'lightred',
 u'Delis': 'beige',
 u'Fast Food': 'darkblue',
 u'Fish & Chips': 'darkgreen',
 u'Food': 'cadetblue',
 u'French': 'darkpurple',
 u'Indian': 'white',
 u'Italian': 'pink',
 u'Pizza': 'lightblue',
 u'Sandwiches': 'lightgreen',
 u'Seafood': 'gray',
 u'Thai': 'black'}

In [8]:
# Summarise the frame: column names, non-null counts and dtypes.
# NOTE(review): this cell's execution count is out of sequence with the cells
# above it; re-run the notebook top to bottom to confirm it still reproduces.
edh.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3539 entries, 0 to 3538
Data columns (total 60 columns):
AcceptsInsurance              17 non-null object
AgesAllowed                   19 non-null object
Alcohol                       1197 non-null object
Ambience                      948 non-null object
BYOB                          0 non-null float64
BYOBCorkage                   1 non-null object
BestNights                    259 non-null object
BikeParking                   1235 non-null object
BusinessAcceptsBitcoin        104 non-null object
BusinessAcceptsCreditCards    2488 non-null object
BusinessParking               1989 non-null object
ByAppointmentOnly             220 non-null object
Caters                        431 non-null object
CoatCheck                     377 non-null object
Corkage                       0 non-null float64
DietaryRestrictions           53 non-null object
DogsAllowed                   218 non-null object
DriveThru                     110 non-null obj