## Preliminaries

In [149]:
%matplotlib inline

In [247]:
from __future__ import unicode_literals, division

import IPython
import warnings
import requests
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

from pylab import *
from IPython.display import HTML
from IPython.display import display as prnt
from random import sample

# Notebook Options
warnings.filterwarnings('ignore')
np.set_printoptions(suppress=True)

# Matplotlib Styles
c = {'axes.labelsize': 17,
'axes.titlesize': 16,
'figure.figsize': [18, 8],
'grid.linewidth': 1.6,
'legend.fontsize': 17,
'lines.linewidth': 2,
'lines.markeredgewidth': 0.0,
'lines.markersize': 11,
'patch.linewidth': 0.5,
'xtick.labelsize': 16,
'xtick.major.pad': 20,
'xtick.major.width': 2,
'xtick.minor.width': 1,
'ytick.labelsize': 16.0,
'ytick.major.pad': 20,
'ytick.major.width': 2,
'ytick.minor.width': 1 }
wide_c = dict(c, **{'figure.figsize':[20,8]})

In [152]:
def table(df,replace_match="",replace_str=""):
    return IPython.display.display(HTML(df.to_html().replace('<table border="1" class="dataframe">','<table class="table table-striped table-hover">').replace(replace_match,replace_str)))


## Data - Getting Global Cities

In [192]:
DATA_DIR = '../data/cities/'
CITIES_FILE = DATA_DIR + 'google_global_cities.csv'

df = pd.read_csv(CITIES_FILE)

df.describe()

Unnamed: 0,CriteriaID,ParentID
count,86460.0,86225.0
mean,6163814.708027,199841.391453
std,3862852.852441,1260101.607985
min,2004.0,2016.0
25%,1022781.75,20417.0
50%,9012996.5,21148.0
75%,9040921.25,21167.0
max,9062587.0,9062587.0


In [193]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 86460 entries, 0 to 86459
Data columns (total 7 columns):
CriteriaID       86460 non-null int64
Name             86460 non-null object
CanonicalName    86460 non-null object
ParentID         86225 non-null float64
CountryCode      86459 non-null object
TargetType       86460 non-null object
Status           86460 non-null object
dtypes: float64(1), int64(1), object(5)
memory usage: 5.3+ MB


NB - Pandas doesn't construct columns if there is a space in the name

In [194]:
pd.unique(df.TargetType.ravel())

array(['City', 'Municipality', 'Neighborhood', 'County', 'Borough',
       'University', 'Airport', 'Province', 'State', 'Country',
       'DMA Region', 'Territory', 'Canton', 'Region',
       'Autonomous Community', 'Union Territory', 'Prefecture',
       'Governorate', 'Postal Code', 'Congressional District',
       'Department', 'TV Region', 'Okrug', 'City Region', 'District'], dtype=object)

In [221]:
pd.unique(df.CountryCode.ravel())

array(['AE', 'AR', 'AU', 'AT', 'BE', 'BG', 'BR', 'CA', 'CH', 'CL', 'CN',
       'CO', 'CZ', 'DE', 'DK', 'EG', 'ES', 'FI', 'FR', 'GB', 'HR', 'HU',
       'ID', 'IN', 'IE', 'IL', 'IT', 'JP', 'KR', 'MA', 'MX', 'MY', 'NG',
       'NL', 'NO', 'NZ', 'PK', 'PH', 'PL', 'PT', 'RO', 'RU', 'SA', 'SE',
       'TH', 'TR', 'TW', 'UA', 'US', 'VN', 'ZA', 'EE', 'GR', 'LT', 'LV',
       'PR', 'SI', 'SK'], dtype=object)

In [None]:
df = df[df.TargetType.isin( ['City'] )]

In [222]:
df = df[df.CountryCode.isin( ['GB'] )]

In [242]:
df2 = df.ix[sample(df.index,10)]
df2

Unnamed: 0,CriteriaID,Name,CanonicalName,ParentID,CountryCode,TargetType,Status,FSqName
5361,1006645,Coleford,"Coleford,England,United Kingdom",20339,GB,City,Active,"Coleford, England, United Kingdom"
5543,1006827,Irlam,"Irlam,England,United Kingdom",20339,GB,City,Active,"Irlam, England, United Kingdom"
5754,1007039,Rustington,"Rustington,England,United Kingdom",20339,GB,City,Active,"Rustington, England, United Kingdom"
5281,1006565,Brighton,"Brighton,England,United Kingdom",20339,GB,City,Active,"Brighton, England, United Kingdom"
5671,1006955,North Shields,"North Shields,England,United Kingdom",20339,GB,City,Active,"North Shields, England, United Kingdom"
5831,1007117,Stockley,"Stockley,England,United Kingdom",20339,GB,City,Active,"Stockley, England, United Kingdom"
5932,1007218,Westhoughton,"Westhoughton,England,United Kingdom",20339,GB,City,Active,"Westhoughton, England, United Kingdom"
5883,1007169,Titchfield,"Titchfield,England,United Kingdom",20339,GB,City,Active,"Titchfield, England, United Kingdom"
5453,1006737,Geddington,"Geddington,England,United Kingdom",20339,GB,City,Active,"Geddington, England, United Kingdom"
5674,1006958,Northallerton,"Northallerton,England,United Kingdom",20339,GB,City,Active,"Northallerton, England, United Kingdom"


In [238]:
df['FSqName'] = df.CanonicalName.str.replace(',', ', ')

In [246]:
len(df)

979

##Data - Query Foursquare

In [116]:
CLIENT_ID = 'ILD0TB12XNHPLVGIFY5O3RCEQG30D0JRHZX0PANKWMHKC2WS'
CLIENT_SECRET = 'VFDNL2QOVEWZG5TN51TPEOTNJ0BI0ZD351B4FWGE2ARCYYMD'
VERSION = '20140801'

In [259]:
import urllib2
import json

foursq_url_name = 'https://api.foursquare.com/v2/venues/explore?client_id=%s&client_secret=%s&v=%s&near=%s'

foursq_url_ll = 'https://api.foursquare.com/v2/venues/explore?client_id=%s&client_secret=%s&v=%s&ll=%s'


def getVenuesByName(near):
    url = foursq_url_name % (CLIENT_ID, CLIENT_SECRET, VERSION, near)  
    venueCategoryList = []    
    
    try:
        response = urllib2.urlopen(url)
        data = json.load(response)
    
        items = data['response']['groups'][0]['items']

        for item in items:
            name = item['venue']['name']
            rating = item['venue']['rating']
            for category in item['venue']['categories']:
                if (category['primary'] == True):
                    venueCategoryList.append(category['name'])
    except:
        return venueCategoryList            

    else:
        return venueCategoryList            



In [270]:
df2 = df.ix[sample(df.index,5)]
df2

for row in df2.values:
    city = row[7]
    city_features = getVenuesByName(city)
    
    print city + " -> "
    print city_features

Newcastle, Northern Ireland, United Kingdom -> 
[u'Caf\xe9', u'Caf\xe9', u'Beer Garden', u'Caf\xe9', u'Park', u'Caf\xe9', u'Theater', u'Spanish Restaurant', u'Bar', u'Coffee Shop', u'Wine Bar', u'Asian Restaurant', u'Caf\xe9', u'Asian Restaurant', u'Beach', u'Museum', u'Ice Cream Shop', u'Pizza Place', u'Caf\xe9', u'Caf\xe9', u'Caf\xe9', u'Bar', u'Caf\xe9', u'Bar', u'Caf\xe9', u'Flea Market', u'Caf\xe9', u'Restaurant', u'Bar', u'Caf\xe9']
Shepperton, England, United Kingdom -> 
[u'Garden Center', u'Performing Arts Venue', u'Pub', u'Coffee Shop', u'Indie Movie Theater', u'Pub', u'Pub', u'Italian Restaurant', u'Pub', u'Gastropub', u'Pub', u'Theme Park', u'Theme Park Ride / Attraction', u'Coffee Shop', u'Theme Park Ride / Attraction', u'Asian Restaurant', u'Theme Park Ride / Attraction', u'Theme Park Ride / Attraction', u'Theme Park Ride / Attraction', u'Japanese Restaurant', u'Pub', u'Coffee Shop', u'Italian Restaurant', u'Museum', u'Portuguese Restaurant', u'Racetrack', u'Breakfast Spot

In [260]:

a = getVenuesByName('Brighton, England, United Kingdom')
b = getVenuesByName('Chicago, IL, United States')



In [261]:

print a

print "--"

print b


[u'Sushi Restaurant', u'American Restaurant', u'Multiplex', u'Pub', u'Italian Restaurant', u'Coffee Shop', u'Bakery', u'American Restaurant', u'American Restaurant', u'New American Restaurant', u'Brewery', u'Sandwich Place', u'Dive Bar', u'Department Store', u'Mediterranean Restaurant', u'Steakhouse', u'Pet Store', u'Department Store', u'Japanese Restaurant', u'Dessert Shop', u'Greek Restaurant', u'Breakfast Spot', u'Burger Joint', u'Coffee Shop', u'Breakfast Spot', u'Mexican Restaurant', u'Burger Joint', u'Fast Food Restaurant', u'American Restaurant', u'Supermarket']
--
[u'Park', u'Theater', u'Art Museum', u'Park', u'Farmers Market', u'Public Art', u'Comedy Club', u'Coffee Shop', u'Caf\xe9', u'Brewery', u'Trail', u'Concert Hall', u'New American Restaurant', u'Science Museum', u'Bar', u'Seafood Restaurant', u'Mexican Restaurant', u'Pizza Place', u'Brewery', u'Indie Movie Theater', u'Pie Shop', u'New American Restaurant', u'Stadium', u'Cuban Restaurant', u'Tapas Restaurant', u'Gastropu

[u'Concert Hall', u'Baseball Stadium', u'Park', u'Botanical Garden', u'Coffee Shop', u'Performing Arts Venue', u'Plaza', u'Pub', u'Hockey Arena', u'Bar', u'Gastropub', u'Ice Cream Shop', u'Brewery', u'History Museum', u'Indie Movie Theater', u'Mexican Restaurant', u'Burger Joint', u'Zoo', u'Caribbean Restaurant', u'History Museum', u'Museum', u'Hot Dog Joint', u'Diner', u'Distillery', u'Park', u'Science Museum', u'Breakfast Spot', u'Diner', u'German Restaurant', u'Coffee Shop']
--
[u'Park', u'Theater', u'Art Museum', u'Park', u'Farmers Market', u'Public Art', u'Comedy Club', u'Coffee Shop', u'Caf\xe9', u'Brewery', u'Trail', u'Concert Hall', u'New American Restaurant', u'Science Museum', u'Bar', u'Seafood Restaurant', u'Mexican Restaurant', u'Pizza Place', u'Brewery', u'Indie Movie Theater', u'Pie Shop', u'New American Restaurant', u'Stadium', u'Cuban Restaurant', u'Tapas Restaurant', u'Gastropub', u'Grocery Store', u'Korean Restaurant', u'Seafood Restaurant', u'Coffee Shop']

In [148]:
import numpy as np
from sklearn.metrics import jaccard_similarity_score

print jaccard_similarity_score(a, b)

0.0
