# Get Food Safety Authority Data

In [1]:
import pandas as pd
from urllib.request import urlopen
import xml.etree.ElementTree as ET
from collections import OrderedDict
from io import StringIO

def download_foodsafety_data_and_convert_to_csv(xml_id='FHRS776en-GB.xml', out_fpath='food_safety.csv'):
    """
    Download one food-safety open-data XML file and save it as a CSV table.

    The file is fetched from http://ratings.food.gov.uk/OpenDataFiles/<xml_id>;
    the available file names (one per local authority) can be found at
    http://ratings.food.gov.uk/open-data/en-GB

    Every ``EstablishmentDetail`` element becomes one CSV row. Each direct
    child element becomes a column holding that element's text, except
    ``Geocode``: its children are promoted to columns of their own and
    converted to float (an empty coordinate is written as a missing value).

    Parameters
    ----------
    xml_id : str
        File name of the XML document to download.
    out_fpath : str
        Path of the CSV file to write (overwritten if it exists).
    """
    url = 'http://ratings.food.gov.uk/OpenDataFiles/{}'.format(xml_id)
    # Use the response as a context manager so the connection is always closed.
    with urlopen(url) as response:
        # Parse the raw bytes: the parser then honours the encoding declared
        # in the XML prolog instead of assuming UTF-8.
        root = ET.fromstring(response.read())

    def get_info_from_xml():
        for establishment in root.iter('EstablishmentDetail'):
            record = OrderedDict()
            for item in establishment:
                if item.tag == 'Geocode':
                    for coordinate in item:
                        # An empty element has text None; float(None) would raise.
                        text = (coordinate.text or '').strip()
                        record[coordinate.tag] = float(text) if text else None
                else:
                    record[item.tag] = item.text
            yield record

    df = pd.DataFrame(list(get_info_from_xml()))
    df.to_csv(out_fpath, index=False)

In [2]:
# Fetch the FHRS776 open-data file and store it under a Glasgow-specific name.
download_foodsafety_data_and_convert_to_csv('FHRS776en-GB.xml', 'food_safety_Glasgow.csv')

# Get Yelp Data

To get a Yelp API key, follow [these instructions](https://www.yelp.com/developers/documentation/v3/authentication).
Then create a file called `api_key.py` next to this notebook containing the single line `YELP_API_KEY = "<your yelp api key>"`

In [5]:
from api_key import YELP_API_KEY

In [6]:
import pandas as pd
import requests
from urllib.parse import quote
import json

class Client():
    """Minimal client for the Yelp business-search endpoint.

    The API key is sent as a Bearer token in the ``Authorization`` header
    of every request.
    """
    endpoint = 'https://api.yelp.com/v3/businesses/search?'
    # Query parameters accepted by search_businesses. The attribute name is
    # misspelled ("seach") but kept as-is so existing references keep working.
    valid_seach_keys = ['term','location','latitude','longitude','radius','categories','locale','limit','offset','sort_by','price','open_now','open_at','attributes']

    def __init__(self, api_key):
        self.api_key = api_key

    def request(self, endpoint, timeout=10):
        """Send an authenticated GET request to ``endpoint``.

        ``timeout`` (seconds) is forwarded to ``requests.get``; without it a
        stalled connection would block the notebook indefinitely.
        Returns the ``requests`` response object.
        """
        headers = {"Authorization": f"Bearer {self.api_key}"}
        return requests.get(endpoint, headers=headers, timeout=timeout)

    def search_businesses(self, **kwargs):
        """Query the business-search endpoint.

        Keyword arguments other than ``verbose`` are sent as query
        parameters and must all be listed in ``valid_seach_keys``.
        Pass ``verbose=True`` to print the request URL before sending it.

        Raises AssertionError if an unknown search key is given.
        Returns the response object produced by ``request``.
        """
        verbose = kwargs.pop('verbose', False)
        unknown_keys = [k for k in kwargs if k not in self.valid_seach_keys]
        if unknown_keys:
            # Raised explicitly (instead of via `assert`) so the check also runs
            # under `python -O`; the exception type is unchanged for callers.
            raise AssertionError(
                f'Unknown search key(s) {unknown_keys}; valid keys are {self.valid_seach_keys}'
            )
        search_values = [f'{k}={quote(str(v))}' for k, v in kwargs.items()]
        url = self.endpoint + '&'.join(search_values)
        if verbose:
            print('Sending request:', url)
        return self.request(url)
    
def save_to_json(obj, fpath='yelp.json'):
    """Serialise ``obj`` as JSON into the file at ``fpath``, overwriting it."""
    with open(fpath, mode='w') as out_file:
        json.dump(obj, fp=out_file)

def load_from_json(fpath='yelp.json'):
    """Read the file at ``fpath`` and return its parsed JSON content."""
    with open(fpath) as in_file:
        return json.load(in_file)

## Downloading Yelp Data

In [7]:
# Yelp client authenticated with the key imported from api_key.py.
c = Client(YELP_API_KEY)

In [8]:
# Parameters of the Yelp business search.
latitude = 55.863937  # search centre: M8 bridge
longitude = -4.270185
radius = 5000
categories = 'restaurants'
limit = 50  # page size, max=50

In [9]:
offset = 0
last_search_results = limit  # primes the loop condition for the first request
MAX_SIZE = 1000  # stop paging once the offset reaches this value
all_businesses = []

# Page through the search results `limit` items at a time. A page shorter
# than `limit` means there is nothing left to fetch.
while last_search_results >= limit and offset < MAX_SIZE:
    r = c.search_businesses(
        latitude=latitude,
        longitude=longitude,
        radius=radius,
        categories=categories,
        offset=offset,
        limit=limit
    )
    # Decode the body once instead of re-parsing it for every access.
    payload = r.json()
    if 'error' in payload:
        print(payload)
        break

    # A payload without 'businesses' counts as an empty page (which ends the
    # loop) rather than raising KeyError.
    found_businesses = payload.get('businesses', [])
    last_search_results = len(found_businesses)
    offset += last_search_results

    all_businesses.extend(found_businesses)
    print(f'added: {last_search_results}, curr len={len(all_businesses)}')

# Whatever was collected is saved, including after an error response.
save_to_json(obj=all_businesses, fpath='yelp.json')

added: 50, curr len=50
added: 50, curr len=100


# Convert json to table

In [10]:
def convert_item_to_record(b):
    """Flatten a (possibly nested) mapping into a single-level dict.

    Scalar values (int, float, bool, str) are copied under their own key.
    Nested dicts are flattened recursively and merged into the result, so a
    nested key overwrites an earlier key of the same name. Values of any
    other type (None, lists, ...) and the key 'display_address' are skipped.
    """
    skipped_keys = ('display_address',)
    scalar_types = (int, float, bool, str)

    flat = {}
    for key, value in b.items():
        if key in skipped_keys:
            continue
        if isinstance(value, scalar_types):
            flat[key] = value
            continue
        if isinstance(value, dict):
            flat.update(convert_item_to_record(value))
    return flat

In [11]:
# Reload the saved search results from disk.
all_businesses = load_from_json('yelp.json')

In [12]:
# Flatten every business into one record and build a table from the records.
records = list(map(convert_item_to_record, all_businesses))
df = pd.DataFrame(data=records)

# Columns to keep, in the order they should appear in the table.
ordered_col = [
    'name',
    'rating',
    'review_count',
    'price',
    'latitude',
    'longitude',
    'zip_code',
    'id',
    'alias',
    'address1',
    'address2',
    'address3',
    'city',
    'state',
    'country',
    'display_phone',
    'image_url',
    'phone',
    'url',
]

In [13]:
# reindex (rather than df[ordered_col]) so that a column missing from every
# record is written as an empty column instead of raising KeyError.
df.reindex(columns=ordered_col).to_csv('yelp.csv', index=False)

## Matching information

In [14]:
import pandas as pd
import numpy as np
from difflib import SequenceMatcher

In [15]:
# Load the two tables that are going to be matched against each other.
df_yelp = pd.read_csv('yelp.csv')
df_fsa = pd.read_csv('food_safety_Glasgow.csv')

In [16]:
# Keep only the columns that are needed for the matching.
df_yelp = df_yelp.loc[:, ['name', 'latitude', 'longitude', 'zip_code', 'id', 'address1']].copy()
df_fsa = df_fsa.loc[:, ['FHRSID', 'BusinessName', 'AddressLine2', 'PostCode', 'Longitude', 'Latitude']].copy()

# Discard rows that lack a latitude or a business name.
df_yelp = df_yelp.dropna(subset=['latitude', 'name'])
df_fsa = df_fsa.dropna(subset=['Latitude', 'BusinessName'])

In [17]:
def get_distance_score(lat, lon, lat_ref, lon_ref):
    """Return a proximity score in (0, 1]; 1 means identical coordinates.

    The score is exp(-100 * d), where d is the Euclidean distance between
    (lat, lon) and (lat_ref, lon_ref) computed directly on the coordinate
    differences, with no correction for latitude.
    """
    return np.exp(-np.hypot(lat - lat_ref, lon - lon_ref) * 100)

@np.vectorize
def get_str_compare_score(addr, addr_ref):
    """Similarity ratio between two strings, as given by difflib.SequenceMatcher.

    Wrapped in np.vectorize, so either argument may be an array of strings.
    """
    return SequenceMatcher(a=addr, b=addr_ref).ratio()

In [18]:
DIST_SCORE_THREASHOLD = 0.8
NAME_SCORE_THREASHOLD = 0.5

# Module-level record of every match found so far. find_matches still adds its
# results here (as it always did), but its return value is the reliable,
# per-call result.
MATCHES = {}
def find_matches(df, df_ref=None):
    """Score candidate reference establishments for every row of ``df``.

    For each row, reference rows are first filtered by proximity
    (get_distance_score >= DIST_SCORE_THREASHOLD) and then by name
    similarity (get_str_compare_score >= NAME_SCORE_THREASHOLD).

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to match; must have the columns 'id', 'name', 'latitude' and
        'longitude'.
    df_ref : pandas.DataFrame, optional
        Reference table with the columns 'FHRSID', 'BusinessName',
        'Latitude' and 'Longitude'. Defaults to the notebook-level ``df_fsa``.

    Returns
    -------
    dict
        ``{id: {FHRSID: name_score}}`` for this call only. A row with no
        reference establishment close enough is absent; a row whose nearby
        candidates all fail the name threshold maps to an empty dict.
    """
    if df_ref is None:
        df_ref = df_fsa

    # Collect into a fresh dict so that re-running the function (e.g. with
    # other thresholds or inputs) never returns entries from a previous run.
    matches = {}
    n_rows = df.shape[0]
    # The counter starts at 1 so the progress line ends on "n/n".
    for i_counter, (idx, item) in enumerate(df.iterrows(), start=1):
        # filter by distance
        # ==================
        score_dist = get_distance_score(
            lat=df_ref.Latitude, lon=df_ref.Longitude,
            lat_ref=item.latitude, lon_ref=item.longitude
        )
        # Boolean indexing already yields a new frame, so the reference table
        # does not need to be copied on every iteration.
        nearby = df_ref[score_dist >= DIST_SCORE_THREASHOLD]
        if nearby.empty:
            continue

        # score by name
        # =============
        # `item['name']` rather than `item.name`: the latter is the row label.
        name_scores = get_str_compare_score(addr=nearby.BusinessName, addr_ref=item['name'])
        # assign() builds a new frame instead of writing a column into a slice.
        candidates = nearby.assign(scores=name_scores)
        candidates = candidates[candidates['scores'] >= NAME_SCORE_THREASHOLD]
        scores_by_fhrsid = candidates.set_index('FHRSID')['scores'].to_dict()
        matches[item['id']] = scores_by_fhrsid

        # get best match (only used for the progress line)
        # ================================================
        if scores_by_fhrsid:
            best_matching_FHRSID = max(scores_by_fhrsid, key=scores_by_fhrsid.get)
            score_best_match = scores_by_fhrsid[best_matching_FHRSID]
        else:
            best_matching_FHRSID = score_best_match = np.nan
        print(f'\rMatched {i_counter}/{n_rows}: {best_matching_FHRSID}, score: {score_best_match}', end='')

    MATCHES.update(matches)
    return matches

In [19]:
# Run the matching for every Yelp business.
matches_dict = find_matches(df_yelp)

Matched 99/100: 79746, score: 1.0

In [20]:
# creating match matrix: one row per FHRSID, one column per yelp id, cells
# hold the name score (NaN where the pair was not a candidate).
# Yelp ids that map to an empty dict produce all-NaN columns. They are dropped
# because idxmax over an all-NA column is deprecated in recent pandas, and
# such ids cannot take part in a reciprocal match anyway.
match_matrix = (
    pd.DataFrame(matches_dict)
    .dropna(axis=1, how='all')
    .rename_axis(index='FHRSID', columns='yelp_id')
)

# combining reciprocal maximums scoring matches only: keep a (FHRSID, yelp_id)
# pair only when each side is the other's highest-scoring candidate
col_max = match_matrix.idxmax(axis=1)  # best yelp id for every FHRSID
row_max = match_matrix.idxmax(axis=0)  # best FHRSID for every yelp id
col_max.name = 'yelp_id'
row_max.name = 'FHRSID'
d1 = col_max.reset_index()
d2 = row_max.reset_index()
df_matches = pd.merge(d1, d2, on=['FHRSID', 'yelp_id'], how='inner')

In [21]:
# Persist the reciprocal best matches.
# NOTE(review): unlike the other CSV exports in this notebook, the DataFrame
# index is written as well (as an unnamed first column) - confirm this is intended.
df_matches.to_csv('matches.csv')

In [22]:
# Attach the matched FHRSID to the complete Yelp table. The inner join keeps
# only the businesses that have an entry in df_matches.
df_yelp_full = pd.read_csv('yelp.csv')
df_agumented = df_yelp_full.merge(df_matches, left_on='id', right_on='yelp_id', how='inner')

In [23]:
# Export the matched businesses (selected columns only, without the index).
(
    df_agumented
    .loc[:, [
        'name', 'rating', 'review_count', 'price',
        'latitude', 'longitude', 'zip_code',
        'id', 'address1', 'FHRSID',
    ]]
    .to_csv('yelp_matched.csv', index=False)
)