# Get Food Safety Authority Data

In [None]:
import pandas as pd
from urllib.request import urlopen
import xml.etree.ElementTree as ET
from collections import OrderedDict
from io import StringIO

def download_foodsafety_data_and_convert_to_csv(xml_id='FHRS776en-GB.xml', out_fpath='food_safety.csv'):
    """
    Downloads food safety data given the local authority id which can be 
    found at http://ratings.food.gov.uk/open-data/en-GB 
    """
    url = 'http://ratings.food.gov.uk/OpenDataFiles/{}'.format(xml_id)
    response = urlopen(url)
    content = StringIO(response.read().decode())

    def get_info_from_xml():
        etree = ET.parse(content)
        root = etree.getroot()
        for enstab in root.iter('EstablishmentDetail'):
            d = OrderedDict()
            for item in enstab:
                if item.tag == 'Geocode':
                    for ll_info in item:
                        d[ll_info.tag] = float(ll_info.text)
                else:
                    d[item.tag] = item.text
            yield d

    df = pd.DataFrame(list(get_info_from_xml()))
    df.to_csv(out_fpath, index=None)

In [None]:
download_foodsafety_data_and_convert_to_csv(xml_id='FHRS776en-GB.xml', out_fpath='food_safety_Glasgow.csv')

# Get Yelp Data

To get a Yelp API KEY follow [these info](https://www.yelp.com/developers/documentation/v3/authentication). 
Then create a file called api_key.py with *YELP_API_KEY=<your yelp api key\>*

In [1]:
from api_key import YELP_API_KEY

In [44]:
import pandas as pd
import numpy as np
import requests
from urllib.parse import quote
import json

class Client():
    endpoint = 'https://api.yelp.com/v3/businesses/search?'
    valid_seach_keys = ['term','location','latitude','longitude','radius','categories','locale','limit','offset','sort_by','price','open_now','open_at','attributes']

    def __init__(self, api_key):
        self.api_key = api_key
    
    def request(self, endpoint):
        #print(f'GET requesting: {endpoint}')
        headers = {"Authorization":f"Bearer {self.api_key}"}
        r = requests.get(endpoint, headers=headers)
        return r
        
    def search_businesses(self, **kwargs):
        verbose = kwargs.pop('verbose', False)
        for k in kwargs.keys():
            assert k in self.valid_seach_keys
        search_values = [f'{k}={quote(str(v))}' for k,v in kwargs.items()]
        url = self.endpoint + '&'.join(search_values)
        if verbose: print('Sending request:', url)
        r = self.request(url)
        return r
    
def save_to_json(obj, fpath='yelp.json'):
    with open(fpath, 'w') as f:
        json.dump(obj, f)

def load_from_json(fpath='yelp.json'):
    with open(fpath, 'r') as f:
        content = json.load(f)
    return content


class SearchGrid:
    cos30 = np.cos(np.deg2rad(30))
    def __init__(self, ll_coord, ur_coord, step):
        self.minx, self.miny = ll_coord
        self.maxx, self.maxy = ur_coord
        self.step = step
        self.radius = self.step / 2 / self.cos30
        self.margin = self.step - self.radius
        
    def __len__(self):
        counter = 0
        with_x_offset = True
        y_offset = np.sqrt(np.square(self.radius)-np.square(self.step/2))
        for y in np.arange(self.miny+y_offset, self.maxy+self.step/2, self.step*self.cos30):
            with_x_offset = not with_x_offset
            offset = int(with_x_offset) * self.step/2
            for x in np.arange(self.minx+offset, self.maxx+self.step/2, self.step):
                counter += 1
        return counter
        
    def __iter__(self):
        with_x_offset = True
        y_offset = np.sqrt(np.square(self.radius)-np.square(self.step/2))
        for y in np.arange(self.miny+y_offset, self.maxy+self.step/2, self.step*self.cos30):
            with_x_offset = not with_x_offset
            offset = int(with_x_offset) * self.step/2
            for x in np.arange(self.minx+offset, self.maxx+self.step/2, self.step):
                yield x, y, self.radius

In [45]:
sg = SearchGrid(ll_coord=ll_coord, ur_coord=ur_coord, step=0.02)
len(sg)


162

In [None]:
# minx, miny = (0, 0)
# maxx, maxy = (10, 10)
# sg = SearchGrid(ll_coord=(minx, miny), ur_coord=(maxx, maxy), step=2)

# import matplotlib.pyplot as plt
# %matplotlib inline

# fig, ax = plt.subplots(1,1)
# for x, y, r in sg: 
#     xs = []
#     ys = []
#     for a in np.arange(0, 2*np.pi+0.1, 0.3):
#         xs.append(x+r*np.cos(a))
#         ys.append(y+r*np.sin(a))
#     ax.plot(xs, ys)

# ax.plot([minx,maxx, maxx, minx, minx],[miny, miny, maxy, maxy, miny], 'r--', lw=3)
# ax.grid(True)
# fig.set_size_inches(20,20)

In [3]:
c = Client(api_key=YELP_API_KEY)

In [11]:
def approx_radius_to_meters(r, latitude):
    """Converting (roughly) a change in degrees lat/long to meters"""
    r_lat = 110574 * r
    r_lon = 111320 * r * np.cos(np.deg2rad(latitude))
    return max(r_lat, r_lon)

approx_radius_to_meters(r=0.02, latitude=55.863937)

2211.48

In [46]:
# latitude, longitude = 55.863937, -4.270185  # M8 bridge
# radius = 5000

CATEGORIES = 'restaurants'
LIMIT = 50  # max=50
MAX_SIZE = 1000  # before yelp API complains

ll_coord = 55.76308604, -4.385425
ur_coord = 55.926417, -4.093184
step = 0.02

all_businesses = []
sg = SearchGrid(ll_coord=ll_coord, ur_coord=ur_coord, step=step)
len_sg = len(sg)
for i, (lat, lon, radius) in enumerate(sg):
    radius_meters = approx_radius_to_meters(radius, lat)
    
    print(f"\rCentre ({lat:3f}, {lon:.3f}), radius {radius_meters:.1f}m ({i+1}/{len_sg})")
    
    offset = 0
    last_search_results = LIMIT
    while last_search_results >= LIMIT and offset < MAX_SIZE:
        r = c.search_businesses(
            latitude=lat, 
            longitude=lon, 
            radius=int(radius_meters*1.05), 
            categories=CATEGORIES, 
            offset=offset, 
            limit=LIMIT,
        )
        if 'error' in r.json():
            print("Error!!")
            print(r.json())
            break

        found_businesses = r.json()['businesses']
        last_search_results = len(found_businesses)
        offset += last_search_results

        all_businesses.extend(found_businesses)
        print(f'added: {last_search_results}, curr len={len(all_businesses)}')

save_to_json(obj=all_businesses, fpath='yelp.json')

Centre (55.763086, -4.380), radius 1276.8m (1/162)
added: 0, curr len=0
Centre (55.783086, -4.380), radius 1276.8m (2/162)
added: 5, curr len=5
Centre (55.803086, -4.380), radius 1276.8m (3/162)
added: 5, curr len=10
Centre (55.823086, -4.380), radius 1276.8m (4/162)
added: 3, curr len=13
Centre (55.843086, -4.380), radius 1276.8m (5/162)
added: 1, curr len=14
Centre (55.863086, -4.380), radius 1276.8m (6/162)
added: 26, curr len=40
Centre (55.883086, -4.380), radius 1276.8m (7/162)
added: 28, curr len=68
Centre (55.903086, -4.380), radius 1276.8m (8/162)
added: 5, curr len=73
Centre (55.923086, -4.380), radius 1276.8m (9/162)
added: 3, curr len=76
Centre (55.773086, -4.362), radius 1276.8m (10/162)
added: 1, curr len=77
Centre (55.793086, -4.362), radius 1276.8m (11/162)
added: 3, curr len=80
Centre (55.813086, -4.362), radius 1276.8m (12/162)
added: 5, curr len=85
Centre (55.833086, -4.362), radius 1276.8m (13/162)
added: 9, curr len=94
Centre (55.853086, -4.362), radius 1276.8m (14/

# Convert json to table

In [47]:
def convert_item_to_record(b):
    EXCLUDE = ['display_address']
    record = {}
    for k,v in b.items():
        if k in EXCLUDE: continue
        if isinstance(v, (int, float, bool, str)):
            record[k] = v
        elif isinstance(v, dict):
            sub_record = convert_item_to_record(v)
            record.update(sub_record)
    return record

In [48]:
all_businesses = load_from_json(fpath='yelp.json')

In [49]:
records = [convert_item_to_record(b) for b in all_businesses]
df = pd.DataFrame(records)

ordered_col = [
    'name', 'rating', 'review_count',  'price',
    'latitude', 'longitude', 'zip_code', 
    'id', 'alias',
    'address1', 'address2', 'address3',  'city', 'state', 'country',
    'display_phone', 'image_url', 'phone', 'url'
]

In [58]:
df.drop_duplicates(subset='id')[ordered_col].to_csv('yelp.csv', index=None)

## Matching information

In [59]:
import pandas as pd
import numpy as np
from difflib import SequenceMatcher

In [60]:
df_fsa = pd.read_csv('food_safety_Glasgow.csv')
df_yelp_full = pd.read_csv('yelp.csv')

In [61]:
df_yelp = df_yelp_full[['name', 'latitude', 'longitude', 'zip_code', 'id', 'address1']].copy()
df_fsa = df_fsa[['FHRSID', 'BusinessName', 'AddressLine2', 'PostCode', 'Longitude', 'Latitude']].copy()

df_yelp = df_yelp[pd.notnull(df_yelp.latitude) & pd.notnull(df_yelp['name'])]
df_fsa = df_fsa[pd.notnull(df_fsa.Latitude) & pd.notnull(df_fsa['BusinessName'])]

In [62]:
def get_distance_score(lat, lon, lat_ref, lon_ref):
    lat_diff = lat - lat_ref
    lon_diff = lon - lon_ref
    pseudo_dist = np.hypot(lat_diff, lon_diff)
    return np.exp(-pseudo_dist*100)

@np.vectorize
def get_str_compare_score(addr, addr_ref):
    score = SequenceMatcher(None, addr, addr_ref).ratio()
    return score

In [63]:
DIST_SCORE_THREASHOLD = 0.9
NAME_SCORE_THREASHOLD = 0.5

MATCHES = {}
def find_matches(df):
    for i_counter, (idx, item) in enumerate(df.iterrows()):
        search_df = df_fsa.copy()

        # filter by distance
        # ==================
        score_dist = get_distance_score(
            lat=search_df.Latitude, lon=search_df.Longitude, 
            lat_ref=item.latitude, lon_ref=item.longitude
        )
        search_df = search_df[score_dist >= DIST_SCORE_THREASHOLD]
        if search_df.empty: continue

        # score by name
        # =============
        search_df['scores'] = get_str_compare_score(addr=search_df.BusinessName, addr_ref=item['name'])
        search_df = search_df[search_df['scores'] >= NAME_SCORE_THREASHOLD]
        MATCHES[item['id']] = search_df[['FHRSID','scores']].set_index('FHRSID')['scores'].to_dict()

        # get best match
        # ==============
        if search_df.empty:
            idx_best_match = score_best_match = best_matching_FHRSID = np.nan
        else:
            idx_best_match = search_df['scores'].idxmax()
            score_best_match = search_df.loc[idx_best_match].scores
            best_matching_FHRSID = search_df.loc[idx_best_match].FHRSID
        print(f'\rMatched {i_counter}/{df.shape[0]}: {best_matching_FHRSID}, score: {score_best_match}', end='')
    
    return MATCHES

In [64]:
matches_dict = find_matches(df=df_yelp)

Matched 1534/1553: nan, score: nan

In [65]:
# creating match matrix
match_matrix = pd.DataFrame(matches_dict)
match_matrix.index.name = 'FHRSID'
match_matrix.columns.name = 'yelp_id'
match_matrix.head()

# combining reciprocal maximums scoring matches only
col_max = match_matrix.idxmax(axis=1)
row_max = match_matrix.idxmax(axis=0)
col_max.name = 'yelp_id'
row_max.name = 'FHRSID'
d1 = col_max.reset_index()
d2 = row_max.reset_index()
df_matches = pd.merge(d1, d2, on=['FHRSID', 'yelp_id'], how='inner')

In [66]:
# adding FHRSID to Yelp dataset via inner join
df_agumented = pd.merge(left=df_yelp_full, right=df_matches, left_on='id', right_on='yelp_id', how='inner')

In [67]:
df_agumented[['name', 'rating', 'review_count', 'price', 'latitude', 'longitude', 'zip_code', 'id', 'address1', 'FHRSID']]\
    .to_csv('yelp_matched.csv', index=0)