# Entity Resolution
## Data Loading and Cleanup

In [1]:
import json
import pandas as pd
import math
import numpy as np
from difflib import SequenceMatcher

# Directory prefix that is joined (by plain string concatenation) with the
# file names below when the datasets are opened.
PATH = "Prakhar/er-assignment/fs/Instabase%20Drive/files/datasets/"

# Logical dataset name -> file name located under PATH.
FILES = dict(
    foursquare_train="foursquare_train_hard.json",
    foursquare_test="foursquare_test_hard.json",
    locu_train="locu_train_hard.json",
    locu_test="locu_test_hard.json",
    matches="matches_train_hard.csv",
)

In [31]:
# Ordered (regex, replacement) pairs applied to the ``name`` column.
# Order matters: the en dash is first mapped to '-' and the punctuation rule
# that follows then strips every '-'.
_NAME_SUBSTITUTIONS = [
    (u"\xe9", 'e'),
    (u"\xed", 'i'),
    (u'\u2019', ''),
    (u'\xc7', 'c'),
    (u'\u2013', '-'),
    (r':|\'|,|\.|-', ''),
    ('&', 'and'),
    (r'\s+|\/', ' '),
]

# Strip brackets, whitespace and dashes from phone numbers.
_PHONE_SUBSTITUTIONS = [
    (r'\(|\)|\s|-', ''),
]

# Remove HTML superscript tags from street addresses.
_STREET_SUBSTITUTIONS = [
    (r'<sup>|<\/sup>', ''),
]

# Reduce a URL to the text before its first dot: drop the U+200E character
# (and its literal "\u200e" spelling), the scheme and an optional "www."
# prefix, then everything from the first remaining dot onwards.
_WEBSITE_SUBSTITUTIONS = [
    (u"\u200e", ''),
    (r'http(s)?://(www\.)?|\\u200e', ''),
    (r'\..*', ''),
]


def _regex_sub(series, substitutions):
    """Return *series* after applying each ``(pattern, replacement)`` in order.

    A new Series is returned and must be assigned back to the frame by the
    caller; calling ``replace(..., inplace=True)`` on ``df[col]`` is not
    guaranteed to modify ``df`` itself.
    """
    for pattern, replacement in substitutions:
        series = series.replace([pattern], [replacement], regex=True)
    return series


def _clean_frame(df):
    """Normalise one listings frame in place.

    Drops the ``country``/``region``/``locality`` columns, turns empty strings
    into missing values, coerces ``id`` to str and the coordinates to numbers,
    and normalises the ``name``, ``phone``, ``street_address`` and ``website``
    text columns.
    """
    df.drop(['country', 'region', 'locality'], inplace=True, axis=1)

    df.replace([''], [None], inplace=True)

    df['id'] = df['id'].astype('str')
    df['latitude'] = pd.to_numeric(df['latitude'])
    df['longitude'] = pd.to_numeric(df['longitude'])

    # NOTE(review): astype(str) may turn missing names/addresses into literal
    # text; kept as in the original so these columns always hold strings.
    df['name'] = _regex_sub(df['name'], _NAME_SUBSTITUTIONS).astype(str).str.lower()

    df['phone'] = _regex_sub(df['phone'], _PHONE_SUBSTITUTIONS)

    df['street_address'] = _regex_sub(df['street_address'], _STREET_SUBSTITUTIONS).astype(str)

    website = _regex_sub(df['website'], _WEBSITE_SUBSTITUTIONS)
    # Remember which entries are missing *before* stringifying. Lower-casing
    # the stringified column and then looking for the text 'None' (as the
    # original did) can never match, which left missing websites as text.
    missing = website.isnull()
    website = website.astype(str).str.lower()
    website[missing] = None
    df['website'] = website


# NOTE(review): `ib` is not defined in this file; presumably it is provided by
# the notebook runtime -- confirm before running elsewhere.
fs_train = pd.read_json(ib.open(PATH + FILES["foursquare_train"]))
fs_test = pd.read_json(ib.open(PATH + FILES["foursquare_test"]))
lc_train = pd.read_json(ib.open(PATH + FILES["locu_train"]))
lc_test = pd.read_json(ib.open(PATH + FILES["locu_test"]))
matches = pd.read_csv(ib.open(PATH + FILES["matches"]))

data_list = [fs_train, fs_test, lc_train, lc_test]

for df in data_list:
    _clean_frame(df)

Prepend a source prefix (`fs_` or `lc_`) to every column name so that the columns can be told apart once the frames are concatenated:

In [32]:
# Tag every column with the data source it belongs to: 'fs_' for the two
# fs_* frames and 'lc_' for the two lc_* frames.
for prefix, frames in (('fs_', [fs_train, fs_test]), ('lc_', [lc_train, lc_test])):
    for df in frames:
        df.columns = [prefix + str(col) for col in list(df.columns)]

## Create Row Combinations
Each LC row is repeated consecutively, while the FS data frame is repeated as a whole. The two results are then concatenated side by side to create the combined data frame.

In [47]:
def _pair_all_rows(lc_frame, fs_frame):
    """Build the two halves of the LC x FS cross product.

    Returns ``(left, right)``, both with ``len(lc_frame) * len(fs_frame)``
    rows and a fresh 0..n-1 index. ``left`` holds every LC row repeated
    consecutively once per FS row; ``right`` holds the whole FS frame tiled
    once per LC row. Row ``k`` of the two halves therefore forms one distinct
    (LC row, FS row) pair.

    The original repeated each frame by its *own* length, which only lines
    up when both frames happen to have the same number of rows.
    """
    n_lc, n_fs = len(lc_frame), len(fs_frame)
    # Positional indexing keeps this correct even if an index has duplicates.
    lc_positions = np.repeat(np.arange(n_lc), n_fs)
    fs_positions = np.tile(np.arange(n_fs), n_lc)
    left = lc_frame.iloc[lc_positions].reset_index(drop=True)
    right = fs_frame.iloc[fs_positions].reset_index(drop=True)
    return left, right


train_left, train_right = _pair_all_rows(lc_train, fs_train)
train = pd.concat([train_left, train_right], axis=1)

test_left, test_right = _pair_all_rows(lc_test, fs_test)
test = pd.concat([test_left, test_right], axis=1)

## Similarity Functions

In [52]:
def find_distance(pt1, pt2):
    """Return the straight-line (Euclidean) distance between two 2-D points.

    Only the first two components of ``pt1`` and ``pt2`` are used.
    """
    delta_first = pt1[0] - pt2[0]
    delta_second = pt1[1] - pt2[1]
    return math.sqrt(delta_first ** 2 + delta_second ** 2)
    
def string_similarity(str1, str2):
    """Return the ``difflib.SequenceMatcher`` ratio of the two sequences.

    The ratio is a float in [0, 1]; 1.0 means the sequences are identical.
    """
    matcher = SequenceMatcher(None, a=str1, b=str2)
    return matcher.ratio()

## Preliminary testing only using distances

In [273]:
def _coords_by_id(frame, prefix):
    """Map every id in *frame* to its ``(latitude, longitude)`` tuple.

    The column-prefixing cell renames the columns to e.g. ``fs_id``; the
    prefixed name is used when present and the bare name otherwise, so this
    works whichever state the frame is in.
    """
    def column(name):
        prefixed = prefix + name
        return prefixed if prefixed in frame.columns else name

    coordinates = zip(frame[column('latitude')], frame[column('longitude')])
    return dict(zip(frame[column('id')], coordinates))


def _nearest_ids(lc_coords, fs_coords):
    """Map each LC id to the FS id with the smallest ``find_distance``.

    An LC id maps to ``None`` when no FS candidate yields a distance below
    infinity (no candidates at all, or only NaN distances). The original
    never reset ``min_key``, so in that case it reused the previous
    iteration's key or raised ``NameError``.
    """
    nearest = {}
    for lc_key, lc_loc in lc_coords.items():
        min_dist = float("inf")
        min_key = None
        for fs_key, fs_loc in fs_coords.items():
            dist = find_distance(lc_loc, fs_loc)
            if dist < min_dist:
                min_dist, min_key = dist, fs_key
        nearest[lc_key] = min_key
    return nearest


fs_train_dict = _coords_by_id(fs_train, 'fs_')
lc_train_dict = _coords_by_id(lc_train, 'lc_')
fs_test_dict = _coords_by_id(fs_test, 'fs_')
lc_test_dict = _coords_by_id(lc_test, 'lc_')

matches_pred = _nearest_ids(lc_train_dict, fs_train_dict)

# Fraction of the known matches whose FS id equals the nearest-neighbour
# prediction. Ids are compared as str because the frame ids were cast to str
# during cleanup; an LC id without a prediction simply counts as a miss.
count = sum(
    1
    for _, row in matches.iterrows()
    if matches_pred.get(str(row['locu_id'])) == str(row['foursquare_id'])
)
print(count / float(len(matches)))

In [287]:
# For every LC test id, pick the FS test id with the smallest find_distance.
matches_pred_test = {}
for lc_key, lc_loc in lc_test_dict.items():  # .items() works on Python 2 and 3
    min_dist = float("inf")
    # Reset per LC id so a previous iteration's winner is never reused when
    # no candidate compares smaller (e.g. NaN coordinates or no candidates).
    min_key = None
    for fs_key, fs_loc in fs_test_dict.items():
        dist = find_distance(lc_loc, fs_loc)
        if dist < min_dist:
            min_dist = dist
            min_key = fs_key
    matches_pred_test[lc_key] = min_key

In [288]:
import csv

# Write the predicted (locu_id, foursquare_id) pairs as a two-column CSV.
username = "woojink"
repo = "best-entity-resolvers"
# NOTE(review): ib.open is called without an explicit mode; confirm that the
# returned handle is writable.
f = ib.open('/{0}/{1}/fs/Instabase%20Drive/files/matches.csv'.format(username,repo))

header = ['locu_id', 'foursquare_id']
try:
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerows([key, val] for key, val in matches_pred_test.items())
finally:
    # Always release the handle, even if one of the writes fails.
    f.close()