## The Levenshtein Distance (edit distance)

\begin{equation*}
    \operatorname{lev}_{a,b}(i,j)=
    \begin{cases} \max(i,j) & \text{if } \min(i,j)=0 \\
    \min
    \begin{cases}
        \operatorname{lev}_{a,b}(i-1,j)+1 \\
        \operatorname{lev}_{a,b}(i,j-1)+1 \\
        \operatorname{lev}_{a,b}(i-1,j-1)+\mathbf{1}_{(a_{i} \neq b_{j})}
    \end{cases} & \text{otherwise}
    \end{cases}
    \tag{1}
\end{equation*}

In [None]:
import pandas as pd
import numpy as np
import Levenshtein as lev

In [None]:
import os
import sys
sys.path.append(os.getcwd())

In [None]:
from sgPlaceAPI import lat_lon_pk, lookup_pk_sgname

In [None]:
# Load the cultural-resource records and preview the first rows.
csv_path = "./cultural_resource_dc_sg.csv"
df = pd.read_csv(filepath_or_buffer=csv_path)
df.head()

In [None]:
# For every record that has a SafeGraph key, look up its SafeGraph name and
# lower-case it; the result keeps the original row labels of `df`.
s1 = (
    df['sg_pk']
    .dropna()
    .apply(lookup_pk_sgname)
    .str.lower()
    .rename('sg_name')
)

In [None]:
# Names ('Item') of the records that have a SafeGraph key.
# Select with a boolean mask through .loc: the previous .iloc call was fed
# index *labels*, which only coincides with positions for a default RangeIndex.
s2 = df.loc[df['sg_pk'].notna(), 'Item'].rename('cp_name')

In [None]:
# Put the SafeGraph name and the source name side by side (aligned on the
# row labels), report how many records were matched, and preview the result.
df_new = pd.DataFrame(s1).join(s2, how='left')
print('{} out of {} records were identified in SafeGraph.'.format(s1.size, df.shape[0]))
df_new.head()

### A function to remove unwanted characters (articles, commas, symbols) and leading and trailing whitespace

In [None]:
def removearticles(text):
    """Normalize a place name for fuzzy comparison.

    Steps, in order:
      1. hyphens become spaces;
      2. periods and commas are deleted;
      3. the text is split on whitespace and the stopwords
         'a', 'an', 'the', 'and', '&' are dropped (case-insensitively);
      4. the remaining words are joined with single spaces.

    Punctuation is removed *before* tokenizing so that a standalone ',' or
    '.' token cannot leave a double or trailing space behind, and so that a
    stopword followed by punctuation (e.g. 'the,') is still recognized.

    Parameters
    ----------
    text : str
        The raw name.

    Returns
    -------
    str
        The cleaned name; the original letter case of kept words is preserved.
    """
    stopwords = {'a', 'an', 'the', 'and', '&'}
    symbols = ['.', ',']

    cleaned = text.replace('-', ' ')
    for symbol in symbols:
        cleaned = cleaned.replace(symbol, '')

    # split() with no argument collapses runs of whitespace and trims the ends
    resultwords = [word for word in cleaned.split() if word.lower() not in stopwords]
    return ' '.join(resultwords)

In [None]:
# Normalize the SafeGraph names element by element.
df_new['sg_name'] = df_new['sg_name'].map(removearticles)

In [None]:
# Normalize the source ('Item') names element by element.
df_new['cp_name'] = df_new['cp_name'].map(removearticles)

In [None]:
# Show the first 30 rows after name cleaning.
df_new.head(30)

In [None]:
# Levenshtein ratio between the two names of each row.
# - Compare case-insensitively: 'sg_name' was lower-cased when it was built
#   but 'cp_name' was not, so case differences would otherwise be counted as
#   edits and understate the similarity.
# - Access the columns by label rather than by position (x[0], x[1]), which
#   depends on column order and is deprecated for labelled Series.
df_new['similarity'] = df_new.apply(
    lambda row: lev.ratio(row['sg_name'].lower(), row['cp_name'].lower()),
    axis=1,
)

In [None]:
# Show the first 30 rows including the similarity score.
df_new.head(30)

## Haversine distance

$$d=2r\arcsin\left(\sqrt{\sin^2\left(\frac{\phi_2-\phi_1}{2}\right)+\cos(\phi_1)\cos(\phi_2)\sin^2\left(\frac{\lambda_2-\lambda_1}{2}\right)}\right)$$

In [None]:
# retrieved from https://stackoverflow.com/a/29546836/4492663

def haversine_np(lon1, lat1, lon2, lat2, radius_km=6367):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees).

    NOTE the argument order: longitude comes before latitude for each point.

    Parameters
    ----------
    lon1, lat1 : float or array-like
        Longitude and latitude of the first point(s), in decimal degrees.
    lon2, lat2 : float or array-like
        Longitude and latitude of the second point(s), in decimal degrees.
    radius_km : float, optional
        Sphere radius in kilometres used to scale the central angle
        (default 6367, the value used by the original implementation).

    Array arguments must be of equal length (or broadcastable); the
    computation is element-wise.

    Returns
    -------
    float or array-like
        Distance(s) in kilometres, same shape as the broadcast inputs.
    """
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat / 2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0)**2
    # Floating-point rounding can push `a` marginally above 1 for
    # (near-)antipodal points, which would make arcsin return NaN.
    a = np.minimum(a, 1.0)

    c = 2 * np.arcsin(np.sqrt(a))
    return radius_km * c

In [None]:
# Sanity check: distance in km from (38.883627, -76.983400) to the
# Washington Monument (38.889484, -77.035279).
# haversine_np takes longitude BEFORE latitude; the values were previously
# passed as (lat, lon, lat, lon), which yields a wrong distance.
haversine_np(lon1=-76.983400, lat1=38.883627, lon2=-77.035279, lat2=38.889484)  # washington monument

In [None]:
# Distance (km) between each record's own coordinates and the coordinates of
# its SafeGraph match.
# - haversine_np expects (lon1, lat1, lon2, lat2); latitude and longitude
#   were previously passed in swapped order, giving wrong distances.
# - Rows are selected with a boolean mask via .loc instead of feeding index
#   labels to .iloc, which is only correct for a default RangeIndex.
sg_matched = df.loc[df['sg_pk'].notna()]
df_new['distance'] = haversine_np(sg_matched['lon'],
                                  sg_matched['lat'],
                                  sg_matched['sg_lon'],
                                  sg_matched['sg_lat'])

In [None]:
# Show the first 30 rows including the distance column.
df_new.head(30)

In [None]:
# Collapse records that share the same (lat, lon): keep the best similarity,
# the smallest distance, and the distinct names joined by commas.
# The distinct names are sorted before joining because set iteration order
# for strings varies between kernel runs, which made the output CSV
# non-reproducible.
df_new = (
    df_new
    .join(df[['lat', 'lon']])
    .groupby(['lat', 'lon'])
    .agg({'similarity': 'max',
          'distance': 'min',
          'cp_name': lambda x: ','.join(sorted(set(x))),
          'sg_name': lambda x: ','.join(sorted(set(x)))})
    .reset_index()
)

In [None]:
# Write the aggregated match table to disk without the row index.
csv_path = "./cultural_resource_dc_sg_match.csv"
df_new.to_csv(path_or_buf=csv_path, index=False)