# Entity Resolution
Xavier Ignacio Gonzalez, Woojin Kim, Diego Miguel Llarrull

## Data loading and preprocessing

In [1]:
import csv
import json
import pandas as pd
import math
import numpy as np
import streetaddress as sa
from difflib import SequenceMatcher

PATH = "Prakhar/er-assignment/fs/Instabase%20Drive/files/datasets/"
FILES = {
    "foursquare_test": "foursquare_test_hard.json",
    "locu_test": "locu_test_hard.json",
    "matches": "matches_train_hard.csv",
    "foursquare_train": "foursquare_train_hard.json",
    "locu_train": "locu_train_hard.json"
}

# Instabase load
# fs_train = pd.read_json(ib.open(PATH + FILES["foursquare_train"]))
# fs_test = pd.read_json(ib.open(PATH + FILES["foursquare_test"]))
# lc_train = pd.read_json(ib.open(PATH + FILES["locu_train"]))
# lc_test = pd.read_json(ib.open(PATH + FILES["locu_test"]))
# matches = pd.read_csv(ib.open(PATH + FILES["matches"]))

# Local load
fs_train = pd.read_json('data/foursquare_train_hard.json')
fs_test = pd.read_json('data/foursquare_test_hard.json')
lc_train = pd.read_json('data/locu_train_hard.json')
lc_test = pd.read_json('data/locu_test_hard.json')
matches = pd.read_csv('data/matches_train_hard.csv')

### Miscellaneous Functions

In [2]:
def find_distance(pt1, pt2):
    return math.sqrt( (pt1[0] - pt2[0])**2 + (pt1[1] - pt2[1])**2 )
    
def string_similarity(str1, str2):
    return SequenceMatcher(None, str1, str2).ratio()

def calc_lcs(s1, s2):
    m = [[0] * (1 + len(s2)) for i in xrange(1 + len(s1))]
    longest, x_longest = 0, 0
    for x in xrange(1, 1 + len(s1)):
        for y in xrange(1, 1 + len(s2)):
            if s1[x - 1] == s2[y - 1]:
                m[x][y] = m[x - 1][y - 1] + 1
                if m[x][y] > longest:
                    longest = m[x][y]
                    x_longest = x
            else:
                m[x][y] = 0
                
    return len(s1[x_longest - longest: x_longest])

# Normalizes street addresses using the streetaddress library. All normalized fields are added as columns
def addr_parse(address):
    if address is not None: 
        addr_parser = sa.StreetAddressParser()
        addr = addr_parser.parse(address)
        format = {'house': [addr['house']],
                  'street_name': [addr['street_name']],
                  'street_type': [addr['street_type']],
                  'suite_num': [addr['suite_num']],
                  'suite_type': [addr['suite_type']] }   
    else: 
        format = {'house': [None],
                  'street_name': [None],
                  'street_type': [None],
                  'suite_num': [None],
                  'suite_type': [None] }
    rv = pd.DataFrame(data = format)
    return rv

### Cleanup

In [3]:
data_list = {'fs_train': fs_train,
             'fs_test': fs_test,
             'lc_train': lc_train,
             'lc_test': lc_test }

fs_train_phone_dir, fs_test_phone_dir = {}, {}
lc_train_phone_dir, lc_test_phone_dir = {}, {}
phone_dir = {'fs_train': fs_train_phone_dir,
                   'fs_test': fs_test_phone_dir,
                   'lc_train': lc_train_phone_dir,
                   'lc_test': lc_test_phone_dir }

for df_name, df in data_list.iteritems():
    df.drop(['country', 'region', 'locality'], inplace=True, axis=1)
    
    df.replace([''], [None], inplace=True)
    
    df['id'] = df['id'].astype('str')
    df['latitude'] = pd.to_numeric(df['latitude'])
    df['longitude'] = pd.to_numeric(df['longitude'])
    
    # Unicode chars to replace
    df['name'].replace([u"\xe9"], ['e'], regex=True, inplace=True)
    df['name'].replace([u"\xed"], ['i'], regex=True, inplace=True)
    df['name'].replace([u'\u2019'], [''], regex=True, inplace=True)
    df['name'].replace([u'\xc7'], ['c'], regex=True, inplace=True)
    df['name'].replace([u'\u2013'], ['-'], regex=True, inplace=True)
    
    df['name'].replace([r':|\'|,|\.|-'], [''], regex=True, inplace=True)
    df['name'].replace(['&'], ['and'], regex=True, inplace=True)
    df['name'].replace(['\s+|\/'], [' '], regex=True, inplace=True)
    df['name'].replace(['\s'], [''], regex=True, inplace=True)

    df['name'] = df['name'].astype(str).str.lower()
    
    df['phone'].replace([r'\(|\)|\s|-'], [''], regex=True, inplace=True)
    
    # Make a phone directory
    current_phone_dir = phone_dir[df_name]
    for i, row in df.iterrows():
        if row['phone'] != None:
            current_phone_dir[row['phone']] = row['id']
    
    df['street_address'].replace([r'<sup>|<\/sup>'], [''], regex=True, inplace=True)
    df['street_address'].replace([r'\.'], [''], regex=True, inplace=True)
    df['street_address'].replace([r'Jfk'], ['John F Kennedy'], regex=True, inplace=True)
    df['street_address'] = df['street_address'].astype(str)
    
    df['website'].replace([u"\u200e"], [''], regex=True, inplace=True)
    df['website'].replace([r'http(s)?://(www.)?|\\u200e'], [''], regex=True, inplace=True)
    df['website'].replace([r'\..*'], [''], regex=True, inplace=True)
    df['website'] = df['website'].astype(str).str.lower()
    df['website'].replace(['None'], [None], inplace=True)
    
    
c = fs_train['street_address'].apply(addr_parse)
cols = pd.concat([i for i in c]).reset_index(drop=True)
fs_train = pd.concat([fs_train,cols], axis = 1)

c = fs_test['street_address'].apply(addr_parse)
cols = pd.concat([i for i in c]).reset_index(drop=True)
fs_test = pd.concat([fs_test,cols], axis = 1)

c = lc_train['street_address'].apply(addr_parse)
cols = pd.concat([i for i in c]).reset_index(drop=True)
lc_train = pd.concat([lc_train,cols], axis = 1)

c = lc_test['street_address'].apply(addr_parse)
cols = pd.concat([i for i in c]).reset_index(drop=True)
lc_test = pd.concat([lc_test,cols], axis = 1)

## Phone number matching
Entities with matching phone numbers are always matching, so we process these first and reduce the size of the testing set

In [4]:
matches_train = {}
for lc_phone in lc_train_phone_dir:
    if lc_phone in fs_train_phone_dir:
        lc_id = lc_train_phone_dir[lc_phone]
        fs_id = fs_train_phone_dir[lc_phone]
        matches_train[lc_id] = fs_id

matches_test = {}
for lc_phone in lc_test_phone_dir:
    if lc_phone in fs_test_phone_dir:
        lc_id = lc_test_phone_dir[lc_phone]
        fs_id = fs_test_phone_dir[lc_phone]
        matches_test[lc_id] = fs_id

Here we confirm that the accuracy is nearly 100% of the training set using phone number matching

In [51]:
how_true = []
for lc_id, fs_id in matches_train.iteritems():
    fs_match_id = matches[matches['locu_id'] == lc_id]['foursquare_id']
    if len(fs_match_id > 0):
        how_true.append(fs_match_id.iloc[0] == fs_id)
print(sum(how_true) / float(len(matches_train)))

0.991189427313


In [6]:
# Data sets with phone matches removed
lc_train_not_matching = [not x for x in lc_train['id'].isin(matches_train.keys())]
fs_train_not_matching = [not x for x in fs_train['id'].isin(matches_train.values())]
lc_test_not_matching = [not x for x in lc_test['id'].isin(matches_test.keys())]
fs_test_not_matching = [not x for x in fs_test['id'].isin(matches_test.values())]

lc_train = lc_train[lc_train_not_matching]
fs_train = fs_train[fs_train_not_matching]
lc_test = lc_test[lc_test_not_matching]
fs_test = fs_test[fs_test_not_matching]

## Create Row Combinations for Machine Learning
Append a prefix to identify the columns when concatenated:

In [7]:
for df in [fs_train, fs_test]:
    df.columns = ['fs_' + str(i) for i in list(df.columns)]
for df in [lc_train, lc_test]:
    df.columns = ['lc_' + str(i) for i in list(df.columns)]

LC data is repeated row at a time, then FS data is repeated entirely at a time. The two are concatenated to create the combo data frame.

In [8]:
train_left =  lc_train.loc[np.repeat(lc_train.index.values, len(lc_train))].reset_index(drop=True)
train_right =  pd.concat([fs_train]*len(fs_train), ignore_index=True)
train = pd.concat([train_left, train_right], axis=1)

test_left =  lc_test.loc[np.repeat(lc_test.index.values, len(lc_test))].reset_index(drop=True)
test_right =  pd.concat([fs_test]*len(fs_test), ignore_index=True)
test = pd.concat([test_left, test_right], axis=1)

### Add match status

In [9]:
# Match dictionary
match_dict = {}
for i, row in matches.iterrows():
    match_dict[row['locu_id']] = row['foursquare_id']

In [10]:
match_column = []
for i, row in train.iterrows():
    lc_id = row['lc_id']
    fs_id = row['fs_id']
    if (lc_id in match_dict) and (match_dict[lc_id] == fs_id):
        match_column.append(1)
    else:
        match_column.append(0)
match_column = np.array(match_column)

In [17]:
train

Unnamed: 0,lc_id,lc_latitude,lc_longitude,lc_name,lc_phone,lc_postal_code,lc_street_address,lc_website,lc_house,lc_street_name,...,fs_phone,fs_postal_code,fs_street_address,fs_website,fs_house,fs_street_name,fs_street_type,fs_suite_num,fs_suite_type,match
0,81df045e563fb6cab7f7,40.810765,-73.952591,honeysaloninc,2126630100,10026,174 Saint Nicholas Ave,none,174,Saint Nicholas,...,,,,none,,,,,,0
1,81df045e563fb6cab7f7,40.810765,-73.952591,honeysaloninc,2126630100,10026,174 Saint Nicholas Ave,none,174,Saint Nicholas,...,,10025,W 97th St,none,,W 97th,St,,,0
2,81df045e563fb6cab7f7,40.810765,-73.952591,honeysaloninc,2126630100,10026,174 Saint Nicholas Ave,none,174,Saint Nicholas,...,,10128,231 E 96th St Apt 6r,none,231,E 96th,St,6r,Apt,0
3,81df045e563fb6cab7f7,40.810765,-73.952591,honeysaloninc,2126630100,10026,174 Saint Nicholas Ave,none,174,Saint Nicholas,...,2124918680,10037,535 Malcolm X Blvd,none,535,Malcolm X,Blvd,,,0
4,81df045e563fb6cab7f7,40.810765,-73.952591,honeysaloninc,2126630100,10026,174 Saint Nicholas Ave,none,174,Saint Nicholas,...,,,Stone St,none,,Stone,St,,,0
5,81df045e563fb6cab7f7,40.810765,-73.952591,honeysaloninc,2126630100,10026,174 Saint Nicholas Ave,none,174,Saint Nicholas,...,2129299600,10014,32 Gansevoort St,redmarketnyc,32,Gansevoort,St,,,0
6,81df045e563fb6cab7f7,40.810765,-73.952591,honeysaloninc,2126630100,10026,174 Saint Nicholas Ave,none,174,Saint Nicholas,...,,10014,,none,,,,,,0
7,81df045e563fb6cab7f7,40.810765,-73.952591,honeysaloninc,2126630100,10026,174 Saint Nicholas Ave,none,174,Saint Nicholas,...,,10023,,none,,,,,,0
8,81df045e563fb6cab7f7,40.810765,-73.952591,honeysaloninc,2126630100,10026,174 Saint Nicholas Ave,none,174,Saint Nicholas,...,2127442200,,101 W 31st St,none,101,W 31st,St,,,0
9,81df045e563fb6cab7f7,40.810765,-73.952591,honeysaloninc,2126630100,10026,174 Saint Nicholas Ave,none,174,Saint Nicholas,...,,10012,,none,,,,,,0


### Calculate various distances

In [11]:
data_list = [train, test]

for d_i, df in enumerate(data_list):
    print("#####\nStarting iteration #{}".format(d_i))
    
    print('Processing distances...')
    distance = []
    for i, row in df.iterrows():
        lc_loc = (row['lc_latitude'], row['lc_longitude'])
        fs_loc = (row['fs_latitude'], row['fs_longitude'])
        distance.append(find_distance(lc_loc, fs_loc))

    print('Processing names...')
    name_dist = []
    for i, row in df.iterrows():
        lc_name = row['lc_name']
        fs_name = row['fs_name']
        name_dist.append(string_similarity(lc_name, fs_name))

    print('Processing ZIP codes...')
    zip_dist = []
    zip_missing = []
    for i, row in df.iterrows():
        lc_zip = row['lc_postal_code']
        fs_zip = row['fs_postal_code']
        if lc_zip and fs_zip:
            zip_dist.append(string_similarity(lc_zip, fs_zip))
            zip_missing.append(0)
        else:
            zip_dist.append(np.nan)
            zip_missing.append(1)

    print('Processing phone numbers...')
    phone_dist = []
    phone_missing = []
    for i, row in df.iterrows():
        lc_phone = row['lc_phone']
        fs_phone = row['fs_phone']
        if lc_phone and fs_phone:
            phone_dist.append(string_similarity(lc_phone, fs_phone))
            phone_missing.append(0)
        else:
            phone_dist.append(np.nan)
            phone_missing.append(1)

    print('Processing URLs...')
    url_dist = []
    url_missing = []
    for i, row in df.iterrows():
        lc_url = row['lc_website']
        fs_url = row['fs_website']
        if lc_url and fs_url:
            url_dist.append(string_similarity(lc_url, fs_url))
            url_missing.append(0)
        else:
            url_dist.append(np.nan)
            url_missing.append(1)
            
    print('Processing street addresses...')
    house_sim, house_missing = [], []
    street_name_sim, street_name_missing = [], []
    street_type_sim, street_type_missing = [], []
    suite_num_sim, suite_num_missing = [], []
    suite_type_sim, suite_type_missing = [], []
    for i, row in df.iterrows():
        lc_house = row['lc_house']
        fs_house = row['fs_house']
        
        lc_street_name = row['lc_street_name']
        fs_street_name = row['fs_street_name']
        
        lc_street_type = row['lc_street_type']
        fs_street_type = row['fs_street_type']
        
        lc_suite_num = row['lc_suite_num']
        fs_suite_num = row['fs_suite_num']
        
        lc_suite_type = row['lc_suite_type']
        fs_suite_type = row['fs_suite_type']
        
        if lc_house and fs_house:
            house_sim.append(string_similarity(lc_house, fs_house))
            house_missing.append(0)
        else:
            house_sim.append(np.nan)
            house_missing.append(1)
        
        if lc_street_name and fs_street_name:
            street_name_sim.append(string_similarity(lc_street_name, fs_street_name))
            street_name_missing.append(0)
        else:
            street_name_sim.append(np.nan)
            street_name_missing.append(1)
            
        if lc_street_type and fs_street_type:
            street_type_sim.append(string_similarity(lc_street_type, fs_street_type))
            street_type_missing.append(0)
        else:
            street_type_sim.append(np.nan)
            street_type_missing.append(1)
        
        if lc_suite_num and fs_suite_num:
            suite_num_sim.append(string_similarity(lc_suite_num, fs_suite_num))
            suite_num_missing.append(0)
        else:
            suite_num_sim.append(np.nan)
            suite_num_missing.append(1)
        
        if lc_suite_type and fs_suite_type:
            suite_type_sim.append(string_similarity(lc_suite_type, fs_suite_type))
            suite_type_missing.append(0)
        else:
            suite_type_sim.append(np.nan)
            suite_type_missing.append(1)
            
    print('Processing LCS...')
    lcs = []
    for i, row in df.iterrows():
        lc_name = row['lc_name']
        fs_name = row['fs_name']
        lcs.append(calc_lcs(lc_name, fs_name))
    
    d = {'distance': distance,
         'name_sim': name_dist,
         'zip_sim': zip_dist,  'zip_missing': zip_missing,
         'phone_sim': phone_dist, 'phone_missing': phone_missing,
         'url_sim': url_dist, 'url_missing': url_missing,
         'house_sim': house_sim, 'house_missing': house_missing,
         'street_name_sim': street_name_sim, 'street_name_missing': street_name_missing,
         'street_type_sim': street_type_sim, 'street_type_missing': street_type_missing,
         'suite_num_sim': suite_num_sim, 'suite_num_missing': suite_num_missing,
         'suite_type_sim': suite_type_sim, 'suite_type_missing': suite_type_missing,
         'lcs': lcs }
    
    if d_i == 0:
        train_data = pd.DataFrame(d).fillna(0)
    else:
        test_data = pd.DataFrame(d).fillna(0)

print("#####\nProcessing Finished!")

#####
Starting iteration #0
Processing distances...
Processing names...
Processing ZIP codes...
Processing phone numbers...
Processing URLs...
Processing street addresses...
Processing LCS...
#####
Starting iteration #1
Processing distances...
Processing names...
Processing ZIP codes...
Processing phone numbers...
Processing URLs...
Processing street addresses...
Processing LCS...
#####
Processing Finished!


### Impute missing values (replace with mean value)

In [101]:
# from sklearn.preprocessing import Imputer

# ip = Imputer(missing_values = 'NaN')
# ip.fit(pd.concat([train_data, test_data], axis=0))

# train_data = pd.DataFrame(ip.fit_transform(train_data))
# test_data = pd.DataFrame(ip.fit_transform(test_data))

## Model Training
* To-do: Hyperparameter Tuning

### Classifiers

In [12]:
from sklearn.cross_validation import StratifiedKFold
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

def cv_run_ada(train_data, train_labels, test_data, test_labels):
    model = AdaBoostClassifier(random_state=1).fit(train_data, train_labels)
    return model.predict(test_data)

def cv_run_bag(train_data, train_labels, test_data, test_labels):
    model = BaggingClassifier(max_features=1.0, random_state=1).fit(train_data, train_labels)
    return model.predict(test_data)

def cv_run_et(train_data, train_labels, test_data, test_labels):
    model = ExtraTreesClassifier(n_estimators=100, max_features=None, random_state=1).fit(train_data, train_labels)
    return model.predict(test_data)

def cv_run_rf(train_data, train_labels, test_data, test_labels):
    model = RandomForestClassifier(n_estimators=100, max_features=None, n_jobs=-1, random_state=1).fit(train_data, train_labels)
    return model.predict(test_data)

def cv_run_dt(train_data, train_labels, test_data, test_labels):
    model = DecisionTreeClassifier(max_features=None, random_state=1).fit(train_data, train_labels)
    return model.predict(test_data)

def cv_run_xg(train_data, train_labels, test_data, test_labels):
    model = XGBClassifier().fit(train_data, train_labels)
    return model.predict(test_data)



### Cross-validation

In [13]:
skf = StratifiedKFold(match_column, n_folds=10, random_state=1, shuffle=True)

overall_corr = 0
wrong_indices = []
for train_index, test_index in skf:
    cv_train_data = train_data.loc[train_index]
    cv_train_labels = match_column[train_index]
    cv_test_data = train_data.loc[test_index]
    cv_test_labels = match_column[test_index]
    
    preds = cv_run_dt(cv_train_data, cv_train_labels, cv_test_data, cv_test_labels)

    fold_corr = sum(preds[cv_test_labels == 1])
    overall_corr += fold_corr
    
    # Collect wrong indices to check
    wrong_ix = [not x for x in preds[cv_test_labels == 1]]
    wrong_indices += list(cv_test_data[cv_test_labels == 1][wrong_ix].index)
        
    fold_acc = fold_corr / float(sum(cv_test_labels))
    print(fold_acc)
    
print("Overall Recall: {}".format(float(overall_corr) / sum(match_column)))

# Test Recall Scores
#######
# With phone matches removed, missing values replaced with 0 (10-fold)
# Ada: 0.932330827068
# Bag: 0.917293233083
# DT: 0.917293233083
# ET: 0.932330827068
# RF: 0.917293233083
# RF, n_estimators=100: 0.924812030075
# XG: 0.90977443609
#######
# With phone matches removed, missing values replaced with 0 (5-fold)
# Ada: 0.917293233083
# Bag: 0.90977443609
# DT: 0.917293233083
# ET: 0.924812030075
# RF: 0.917293233083
# RF, n_estimators=100: 0.924812030075
# XG: 0.90977443609

0.928571428571
0.857142857143
0.857142857143
0.923076923077
1.0
0.846153846154
0.846153846154
0.923076923077
1.0
1.0
Overall Recall: 0.917293233083


### Misclassified examples

In [15]:
train_data.iloc[wrong_indices, :].transpose()

Unnamed: 0,114042,7130,19997,22458,72708,131825,90559,105493,42245,56073,93515
distance,0.0,0.005304,0.006468,0.008889,0.08644,0.02508,0.000234,0.004729,0.000177,0.062195,0.066754
house_missing,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0
house_sim,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
lcs,12.0,14.0,17.0,4.0,4.0,5.0,7.0,8.0,6.0,6.0,3.0
name_sim,1.0,1.0,0.878049,0.555556,0.421053,0.47619,0.533333,0.5,0.689655,0.634146,0.375
phone_missing,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0
phone_sim,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0
street_name_missing,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
street_name_sim,0.0,0.166667,0.727273,0.090909,0.666667,0.2,0.0,0.166667,1.0,0.333333,0.0
street_type_missing,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0


In [14]:
train.iloc[wrong_indices, :].transpose()

Unnamed: 0,114042,7130,19997,22458,72708,131825,90559,105493,42245,56073,93515
lc_id,8c8d4da92dff0657702d,80afa95c01dae3ba5434,825acefd3e298274a150,c170270283ef870d546b,edeba23f215dcc702220,493f5e2798de851ec3b2,f7bb0b23ce99cddcd5c3,212dffb393f745df801a,0a28f2497518cc19f7e5,5f3fd107090d0ddc658b,e3f9d84c0c989f2e7928
lc_latitude,40.7493,40.7285,40.6438,40.7662,40.7776,40.7582,40.7223,40.7398,40.7206,40.714,40.7746
lc_longitude,-73.9771,-74.0001,-73.782,-73.9778,-73.9457,-73.9923,-73.988,-73.9896,-73.9854,-73.9969,-73.9573
lc_name,30parkavenue,lepoissonrouge,greenwichvillagebistro,exhalespa,yorkgrill,pickabagel,karaokebohoorchard,brioflatiron,azulargentinebistro,tsungsunsocialclub,lukes
lc_phone,2125837200,2125053474,7187512890,2125617400,2127720291,2127928008,2127770102,2126732121,6466022004,2122269414,2122497070
lc_postal_code,10065,10012,11430,10019,10128,10036,10002,10003,10002,10002,10075
lc_street_address,583 Park Ave,158 Bleecker St,John F Kennedy International Airport,150 Central Park South,1690 York Ave,360 W 42nd St,196 Orchard St,920 Broadway,152 Stanton St,11 Division St,1394 3rd Ave
lc_website,583parkave,lepoissonrouge,none,exhalespa,yorkgrillnyc,pickabagel42ndstreetnyc,karaokeboho,brioflatiron,azulnyc,none,lukesbarandgrill
lc_house,583,158,,150,1690,360,196,920,152,11,1394
lc_street_name,Park,Bleecker,John F Kennedy International Airport,Central Park South,York,W 42nd,Orchard,Broadway,Stanton,Division,3rd


## Prediction preparation
### Model training and prediction

In [16]:
model = AdaBoostClassifier(random_state=1).fit(train_data, match_column)
labels = model.predict(test_data)

### Combine with the phone matching set then export

In [44]:
# Build and export the file
lc_col = test['lc_id'][labels.astype(bool)]
fs_col = test['fs_id'][labels.astype(bool)]

for lc_id, fs_id in matches_test.iteritems():
    lc_col = lc_col.append(pd.Series(lc_id))
    fs_col = fs_col.append(pd.Series(fs_id))

output = pd.concat([lc_col, fs_col], axis=1)
output.columns = ['locu_id', 'foursquare_id']

with open('results/20160415.csv', 'w') as f:
    output.to_csv(f, index=False, columns = ['locu_id', 'foursquare_id'])
    
# # Instabase version
# username = "woojink"
# repo = "best-entity-resolvers"
# with ib.open('/{0}/{1}/fs/Instabase%20Drive/files/matches.csv'.format(username,repo)) as f:
#     writer = csv.writer(f)
#     header = ['locu_id', 'foursquare_id']
#     writer.writerow(header)
#     for key, val in matches_pred_test.iteritems():
#         writer.writerow([key, val])