In [None]:
import pandas as pd
import re
import numpy as np
from haversine import haversine, Unit
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from fastDamerauLevenshtein import damerauLevenshtein
import Levenshtein as lev
import jaro

In [None]:
# Load the point export (presumably OpenStreetMap points for Canada, judging by
# the variable name). The file is tab-separated despite its .csv extension.
osm_canada = pd.read_csv(
    'canada-points-csv.csv',
    delimiter='\t',
)

In [None]:
# Narrow the points down step by step:
#   1. the point must have a name,
#   2. it must not be tagged as a highway feature,
#   3. it must carry an other_tags string,
#   4. and that string must mention "amenity".
# The steps stay sequential so that step 4 never sees a missing other_tags value.
osm_canada = (
    osm_canada
    .loc[lambda points: points['name'].notna()]
    .loc[lambda points: points['highway'].isna()]
    .loc[lambda points: points['other_tags'].notna()]
    .loc[lambda points: points['other_tags'].str.contains("amenity")]
)

In [None]:
# Drop the columns that the matching below never reads.
osm_canada = osm_canada.drop(
    columns=['barrier', 'highway', 'ref', 'address', 'is_in', 'place', 'man_made', 'osm_id']
)
# X / Y become longitude / latitude, the column names the scoring code looks up.
osm_canada = osm_canada.rename(columns={'X': 'longitude', 'Y': 'latitude'})
# Keep only points whose tag string mentions an address, an English name or a cuisine.
osm_canada = osm_canada[osm_canada['other_tags'].str.contains('addr|name:en|cuisine')]
# other_tags was only needed for filtering.
osm_canada = osm_canada.drop(columns='other_tags')
# Renumber the rows 0..n-1. drop=True discards the old index (the record number
# in the original dataset) instead of keeping it as an extra column, so keep in
# mind that row labels no longer point back into the raw file.
osm_canada = osm_canada.reset_index(drop=True)
osm_canada

In [None]:
# Load the Yelp business records (JSON Lines: one JSON object per line) and
# discard the fields that the matching does not use.
# Per the original author's note, this file already contains only open businesses.
# NOTE(review): the path is absolute and machine-specific; consider making it configurable.
yelp_canada = pd.read_json(
    '/Users/admin/PycharmProjects/atlantbh_internship/Canada.json',
    lines=True,
).drop(
    columns=['business_id', 'hours', 'stars', 'review_count', 'is_open', 'attributes'],
)

In [None]:
# Working sets used by the matching cells. Despite the "_500" suffix both are
# currently the full data sets; for a quick experiment substitute
# `yelp_canada.sample(500)` and `osm_canada.head(5000)` here.
yelp_500 = yelp_canada.drop(
    columns=['address', 'city', 'state', 'categories', 'postal_code'],
)
# Plain alias: osm_500 refers to the same DataFrame object as osm_canada.
osm_500 = osm_canada

In [None]:
def clean(yelp, osm):
    """Lower-case two place names and strip the generic words they share.

    Generic words ("the", "restaurant", "cafe", ...) inflate string similarity
    between otherwise unrelated names. A word is removed only when it occurs in
    BOTH names, so a word that distinguishes the two names is kept.

    Parameters
    ----------
    yelp, osm : object
        The two names to compare. Non-string values are converted with
        ``str()`` first, so a numeric or missing name does not raise.

    Returns
    -------
    tuple of (str, str)
        The cleaned ``(yelp, osm)`` names: lower-cased, shared generic words
        removed, runs of whitespace collapsed and the ends trimmed. Either
        string may be empty if the name consisted only of shared generic words.

    Notes
    -----
    * "the" is matched and removed only as a whole word, so names such as
      "bathe" are left intact. The other keywords are matched as substrings
      (e.g. "house" inside "steakhouse").
    * All shared words are removed in a single regex pass, so the result does
      not depend on set iteration order.
    """
    # Coerce before lower-casing: calling .lower() on a float/None would raise.
    yelp = str(yelp).lower()
    osm = str(osm).lower()

    generic = r"\bthe\b|restaurant|caffe|cafe|theatre|food|house|park|club"
    shared = set(re.findall(generic, yelp)) & set(re.findall(generic, osm))

    if shared:
        # Rebuild a pattern from just the shared words; "the" keeps its word
        # boundaries so it cannot eat the start of "theatre" or part of "bathe".
        remover = re.compile(
            "|".join(
                r"\bthe\b" if word == "the" else re.escape(word)
                for word in sorted(shared)
            )
        )
        yelp = remover.sub("", yelp)
        osm = remover.sub("", osm)

    # Removing words leaves stray spaces behind; normalise them so they neither
    # distort the edit distance nor make an emptied name look non-empty.
    return " ".join(yelp.split()), " ".join(osm.split())

In [None]:
def score_it(yelp_coord, osm_coord, yelp_name, choices):
    """Score how likely a Yelp business and an OSM point are the same place.

    The score blends geographic proximity (30 %) with name similarity (70 %)
    and is 0 whenever either component is too weak.

    Parameters
    ----------
    yelp_coord, osm_coord : tuple
        Coordinates handed straight to ``haversine`` (the caller in this
        notebook passes ``(latitude, longitude)``).
    yelp_name : object
        Name of the Yelp business; converted with ``str()`` and lower-cased.
    choices : sequence
        Candidate OSM names. Only ``choices[0]`` is used for scoring.

    Returns
    -------
    tuple of (score, dist)
        ``score`` is 0 or a value rounded to two decimals; ``dist`` is the
        haversine distance in metres between the two coordinates.
    """
    max_dist_m = 100        # distance at which the proximity score reaches 0
    min_name_score = 40     # name similarity (0-100) needed for any match
    min_fuzzy_ratio = 10    # cheap pre-check before the edit-distance work

    dist = haversine(yelp_coord, osm_coord, unit='m')
    # One point lost per metre; anything beyond max_dist_m is clamped to 0.
    haver_score = max(int(max_dist_m - dist), 0)

    # Compare names case-insensitively and tolerate non-string values.
    osm_key = str(choices[0]).lower()
    yelp_key = str(yelp_name).lower()

    name_score = 0

    if fuzz.token_set_ratio(osm_key, yelp_key) > min_fuzzy_ratio:
        # First compare the names with shared generic words removed, so that
        # two places matching only on e.g. "restaurant" are not accepted.
        yelp_core, osm_core = clean(yelp_key, osm_key)
        both_have_core = len(yelp_core) != 0 and len(osm_core) != 0
        if both_have_core:
            # damerauLevenshtein is used with its default settings and scaled
            # by 100 (presumably a 0..1 similarity; see the library docs).
            name_score = damerauLevenshtein(yelp_core, osm_core) * 100
        # If nothing is left after cleaning, or the cleaned names are similar
        # enough, the final name score comes from the full names.
        if not both_have_core or name_score > min_name_score:
            name_score = damerauLevenshtein(yelp_key, osm_key) * 100

    if haver_score < 1 or name_score < min_name_score:
        score = 0
    else:
        # Weighted blend; both inputs are on a 0-100 scale, so the result is too.
        score = round(haver_score * 0.3 + name_score * 0.7, 2)

    return score, dist

In [None]:
def _crowding_penalty(osm_neighbors):
    """Return the penalty for the first "crowded" pair of OSM neighbours.

    ``osm_neighbors`` is a list of ``[name, coord, dist_to_yelp]`` entries.
    Pairs are visited in insertion order; the first pair in which one member
    lies farther from the Yelp point than from the other member yields
    ``round((dist_to_yelp - gap) * 0.1, 2)``. Returns ``None`` when no such
    pair exists, so the caller can leave the score untouched.
    """
    for first in range(len(osm_neighbors) - 1):
        for second in range(first + 1, len(osm_neighbors)):
            gap = haversine(osm_neighbors[first][1], osm_neighbors[second][1], unit='m')
            # Check the earlier neighbour first, then the later one.
            for member in (first, second):
                to_yelp = osm_neighbors[member][2]
                if to_yelp > gap:
                    return round((to_yelp - gap) * 0.1, 2)
    return None


def skor(yelp_row):
    """Find the best-scoring OSM point for one Yelp business and grade it.

    Every row of the module-level ``osm_500`` frame within 100 m of the Yelp
    coordinates is scored with ``score_it``; the highest score wins. The score
    is then reduced once if nearby OSM points crowd each other (see
    ``_crowding_penalty``).

    Parameters
    ----------
    yelp_row : pandas.Series
        A row providing ``'name'``, ``'latitude'`` and ``'longitude'``.

    Returns
    -------
    str
        ``"<grade>: <score>, <rec>"`` where grade is ``'not matched'``
        (score < 40), ``'partially matched'`` (score < 80) or ``'matched'``,
        and ``<rec>`` is ``[name, latitude, longitude]`` of the best OSM point,
        or the text ``nan`` when no point scored above 0. Downstream cells
        filter on that ``' nan'`` text, so the format is kept unchanged.
    """
    true_score = 0
    rec = None  # best OSM candidate so far; None means "no candidate"
    osm_neighbors = []
    yelp_coord = (yelp_row['latitude'], yelp_row['longitude'])

    for _, osm_row in osm_500.iterrows():
        osm_coord = (osm_row['latitude'], osm_row['longitude'])

        # Cheap rejection before any string comparison is done.
        if haversine(osm_coord, yelp_coord, unit='m') > 100:
            continue

        # .get() tolerates frames without the optional name columns, which
        # plain indexing would turn into a KeyError.
        score, haver = score_it(
            yelp_coord, osm_coord, yelp_row['name'],
            [osm_row['name'], osm_row.get('name_english'), osm_row.get('official_name')],
        )
        if haver < 100:
            osm_neighbors.append([osm_row['name'], osm_coord, haver])
        if score > true_score:
            true_score = score
            rec = [osm_row['name'], osm_row['latitude'], osm_row['longitude']]

    # Applied at most once, and only when a crowded pair was actually found.
    penalty = _crowding_penalty(osm_neighbors)
    if penalty is not None:
        true_score -= penalty

    if true_score < 40:
        grade = 'not matched'
    elif true_score < 80:
        grade = 'partially matched'
    else:
        grade = 'matched'

    rec_text = 'nan' if rec is None else str(rec)
    return grade + ": " + str(round(true_score, 2)) + ', ' + rec_text

In [None]:
# Grade every Yelp business: apply(axis=1) hands each row to skor in turn and
# the returned text is stored in a new 'verification_score' column.
yelp_500['verification_score'] = yelp_500.apply(skor, axis=1)

In [None]:
# Show only the businesses for which an OSM candidate was found, i.e. whose
# result text does not contain ' nan'. na=True makes a missing result count as
# "contains", so such rows are hidden as well.
yelp_500.loc[~yelp_500['verification_score'].str.contains(' nan', na=True)]