# Entity Resolution
## Data Loading and Cleanup

In [1]:
import json
import pandas as pd
import math
import numpy as np
from difflib import SequenceMatcher

PATH = "Prakhar/er-assignment/fs/Instabase%20Drive/files/datasets/"
FILES = {
    "foursquare_test": "foursquare_test_hard.json",
    "locu_test": "locu_test_hard.json",
    "matches": "matches_train_hard.csv",
    "foursquare_train": "foursquare_train_hard.json",
    "locu_train": "locu_train_hard.json"
}

In [2]:
fs_train = pd.read_json(ib.open(PATH + FILES["foursquare_train"]))
fs_test = pd.read_json(ib.open(PATH + FILES["foursquare_test"]))
lc_train = pd.read_json(ib.open(PATH + FILES["locu_train"]))
lc_test = pd.read_json(ib.open(PATH + FILES["locu_test"]))
matches = pd.read_csv(ib.open(PATH + FILES["matches"]))

data_list = [fs_train, fs_test, lc_train, lc_test]

for df in data_list:
    df.drop(['country', 'region', 'locality'], inplace=True, axis=1)
    
    df.replace([''], [None], inplace=True)
    
    df['id'] = df['id'].astype('str')
    df['latitude'] = pd.to_numeric(df['latitude'])
    df['longitude'] = pd.to_numeric(df['longitude'])
#     df['locality'] = df['locality'].astype('str')
    
    # Unicode chars to replace
    df['name'].replace([u"\xe9"], ['e'], regex=True, inplace=True)
    df['name'].replace([u"\xed"], ['i'], regex=True, inplace=True)
    df['name'].replace([u'\u2019'], [''], regex=True, inplace=True)
    df['name'].replace([u'\xc7'], ['c'], regex=True, inplace=True)
    df['name'].replace([u'\u2013'], ['-'], regex=True, inplace=True)
    
    df['name'].replace([r':|\'|,|\.|-'], [''], regex=True, inplace=True)
    df['name'].replace(['&'], ['and'], regex=True, inplace=True)
    df['name'].replace(['\s+|\/'], [' '], regex=True, inplace=True)

    df['name'] = df['name'].astype(str).str.lower()
    
    df['phone'].replace([r'\(|\)|\s|-'], [''], regex=True, inplace=True)
    
    df['street_address'].replace([r'<sup>|<\/sup>'], [''], regex=True, inplace=True)
    df['street_address'] = df['street_address'].astype(str)
    
    df['website'].replace([u"\u200e"], [''], regex=True, inplace=True)
    df['website'].replace([r'http(s)?://(www.)?|\\u200e'], [''], regex=True, inplace=True)
    df['website'].replace([r'\..*'], [''], regex=True, inplace=True)
    df['website'] = df['website'].astype(str).str.lower()
    df['website'].replace(['None'], [None], inplace=True)

Append a prefix to identify the columns when concatenated:

In [3]:
for df in [fs_train, fs_test]:
    df.columns = ['fs_' + str(i) for i in list(df.columns)]
for df in [lc_train, lc_test]:
    df.columns = ['lc_' + str(i) for i in list(df.columns)]

## Create Row Combinations
LC data is repeated row at a time, then FS data is repeated entirely at a time. The two are concatenated to create the combo data frame.

In [4]:
train_left =  lc_train.loc[np.repeat(lc_train.index.values, len(lc_train))].reset_index(drop=True)
train_right =  pd.concat([fs_train]*len(fs_train), ignore_index=True)
train = pd.concat([train_left, train_right], axis=1)

test_left =  lc_test.loc[np.repeat(lc_test.index.values, len(lc_test))].reset_index(drop=True)
test_right =  pd.concat([fs_test]*len(fs_test), ignore_index=True)
test = pd.concat([test_left, test_right], axis=1)

## Similarity Functions

In [5]:
def find_distance(pt1, pt2):
    return math.sqrt( (pt1[0] - pt2[0])**2 + (pt1[1] - pt2[1])**2 )
    
def string_similarity(str1, str2):
  return SequenceMatcher(None, str1, str2).ratio()

## Computed Columns
### Match status

In [6]:
# Match dictionary
match_dict = {}
for i, row in matches.iterrows():
    match_dict[row['locu_id']] = row['foursquare_id']

In [7]:
match_column = []
for i, row in train.iterrows():
    lc_id = row['lc_id']
    fs_id = row['fs_id']
    if (lc_id in match_dict) and (match_dict[lc_id] == fs_id):
        match_column.append(1)
    else:
        match_column.append(0)
match_column = np.array(match_column)

### Various Distances

In [8]:
data_list = [train, test]

for d_i, df in enumerate(data_list):
    print("#####\nStarting iteration #{}".format(d_i))
    
    print('Processing distances...')
    distance = []
    for i, row in df.iterrows():
        lc_loc = (row['lc_latitude'], row['lc_longitude'])
        fs_loc = (row['fs_latitude'], row['fs_longitude'])
        distance.append(find_distance(lc_loc, fs_loc))

    print('Processing names...')
    name_dist = []
    for i, row in df.iterrows():
        lc_name = row['lc_name']
        fs_name = row['fs_name']
        name_dist.append(string_similarity(lc_name, fs_name))

    print('Processing ZIP codes...')
    zip_dist = []
    for i, row in df.iterrows():
        lc_zip = row['lc_postal_code']
        fs_zip = row['fs_postal_code']
        if lc_zip and fs_zip:
            zip_dist.append(string_similarity(lc_zip, fs_zip))
        else:
            zip_dist.append(0)

    print('Processing phone numbers...')
    phone_dist = []
    for i, row in df.iterrows():
        lc_phone = row['lc_phone']
        fs_phone = row['fs_phone']
        if lc_phone and fs_phone:
            phone_dist.append(string_similarity(lc_phone, fs_phone))
        else:
            phone_dist.append(0)

    print('Processing URLs...')
    url_dist = []
    for i, row in df.iterrows():
        lc_url = row['lc_website']
        fs_url = row['fs_website']
        if lc_url and fs_url:
            url_dist.append(string_similarity(lc_url, fs_url))
        else:
            url_dist.append(0)
    
    d = {'distance': distance,
         'name_dist': name_dist,
         'zip_dist': zip_dist,
         'phone_dist': phone_dist,
         'url_dist': url_dist }
    
    if d_i == 0:
        train_data = pd.DataFrame(d).fillna(0)
    else:
        test_data = pd.DataFrame(d).fillna(0)

#####
Starting iteration #0
Processing distances...
Processing names...
Processing ZIP codes...
Processing phone numbers...
Processing URLs...
#####
Starting iteration #1
Processing distances...
Processing names...
Processing ZIP codes...
Processing phone numbers...
Processing URLs...


## Model Training
* To-do: Discard all the obviously wrong rows

### Classifiers

In [58]:
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier

def cv_run_ada(train_data, train_labels, test_data, test_labels):
    model = AdaBoostClassifier().fit(train_data, train_labels)
    preds = model.predict(test_data)
    print(sum(preds))
    acc = sum(preds == test_labels)/float(len(test_labels))
    return acc

def cv_run_rf(train_data, train_labels, test_data, test_labels):
    model = RandomForestClassifier(random_state=1).fit(train_data, train_labels)
    preds = model.predict(test_data)
    print(sum(preds))
    acc = sum(preds == test_labels)/float(len(test_labels))
    return acc

### Cross-validation

In [59]:
skf = StratifiedKFold(n_folds=3, random_state=1, shuffle=True)

for train_index, test_index in skf.split(train_data, match_column):
    cv_train_data = train_data.loc[train_index]
    cv_train_labels = match_column[train_index]
    cv_test_data = train_data.loc[test_index]
    cv_test_labels = match_column[test_index]
    
    fold_acc = cv_run_ada(cv_train_data, cv_train_labels, cv_test_data, cv_test_labels)
    print(fold_acc)

119
0.999958333333
115
0.999958333333
113
0.999925


## Preliminary testing only using distances

In [28]:
fs_train_dict, lc_train_dict, fs_test_dict, lc_test_dict = {}, {}, {}, {}
for i, row in fs_train.iterrows():
    fs_train_dict[row['id']] = (row['latitude'], row['longitude'])
for i, row in lc_train.iterrows():
    lc_train_dict[row['id']] = (row['latitude'], row['longitude'])
for i, row in fs_test.iterrows():
    fs_test_dict[row['id']] = (row['latitude'], row['longitude'])
for i, row in lc_test.iterrows():
    lc_test_dict[row['id']] = (row['latitude'], row['longitude'])
    
matches_pred = {}
for lc_key, lc_loc in lc_train_dict.iteritems():
    min_dist = float("inf")
    min_loc = None
    for fs_key, fs_loc in fs_train_dict.iteritems():
        dist = find_distance(lc_loc, fs_loc)
        if dist < min_dist:
            min_dist = dist
            min_key = fs_key
    matches_pred[lc_key] = min_key
    
count = 0
for _, row in matches.iterrows():
    lc_id = row['locu_id']
    fs_id = row['foursquare_id']
    if matches_pred[lc_id] == fs_id:
        count += 1
print(count / float(len(matches)))

KeyError: 'latitude'

In [16]:
matches_pred_test = {}
for lc_key, lc_loc in lc_test_dict.iteritems():
    min_dist = float("inf")
    min_loc = None
    for fs_key, fs_loc in fs_test_dict.iteritems():
        dist = find_distance(lc_loc, fs_loc)
        if dist < min_dist:
            min_dist = dist
            min_key = fs_key
    matches_pred_test[lc_key] = min_key

In [17]:
import csv

username = "woojink"
repo = "best-entity-resolvers"
f = ib.open('/{0}/{1}/fs/Instabase%20Drive/files/matches.csv'.format(username,repo))

# with open('output.csv', 'w') as csvfile:
writer = csv.writer(f)

header = ['locu_id', 'foursquare_id']
writer.writerow(header)
for key, val in matches_pred_test.iteritems():
    writer.writerow([key, val])
    
f.close()