In [878]:
import json
import csv
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import editdistance as ed
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.metrics import f1_score,precision_score, recall_score, accuracy_score
from sklearn.preprocessing import PolynomialFeatures as Poly
import re
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

In [126]:
def clean_data(locu_path,fs_path):
        
    #load the data
    foursquare_data = pd.read_json(open(fs_path))
    locu_data = pd.read_json(open(locu_path))

    #drop the columns that have most of the columns same
    foursquare = foursquare_data.drop(['country', 'region', 'locality'], axis=1)
    locu = locu_data.drop(['country', 'region', 'locality'], axis=1)

    def cleanPhone(x):
        if x is None or x is ',':
            return ''
        else:
            return x.replace('(', '').replace(')', '').replace('-', '').replace(' ', '')

    def cleanWebsite(x):
        if x is None or x is ',':
            return ''
        else:
            return x.replace('https://', '').replace('http://', '').replace('www.', '')

    def cleanName(x):
        if x is None or x is ',':
            return ''
        else:
            x = x.replace('\'s', '').replace('#', '').replace('&', '').replace('-', ' ').replace('/', ' ')
            x = x.lower()
            return x

    def cleanAddress(x):
        if x is None or x is ',':
            return ''
        else:
            x = x.replace('\'', '').replace('#', '').replace('&', '').replace('.', '')\
                .replace('@', '').replace(',', '').replace('-', ' ').replace('/', ' ')
            x = x.lower()
            x = x.replace('street', 'st').replace('avenue', 'ave').replace('boulevard', 'blvd')\
                    .replace('place', 'pl').replace('square', 'sq').replace('plaza', 'plz')
            x = " "+x
            x = x.replace(" zero","0").replace(" one","1").replace(" two","2").replace(" three","3")\
                .replace(" four","4").replace(" five","5").replace(" six","6").replace(" seven","7")\
                .replace(" eight","8").replace(" nine","9")
            x = x.strip()
            return x

    foursquare['phone'] = foursquare['phone'].map(cleanPhone)
    foursquare['website'] = foursquare['website'].map(cleanWebsite)
    foursquare['name'] = foursquare['name'].map(cleanName)
    foursquare['street_address'] = foursquare['street_address'].map(cleanAddress)

    locu['phone'] = locu['phone'].map(cleanPhone)
    locu['website'] = locu['website'].map(cleanWebsite)
    locu['name'] = locu['name'].map(cleanName)
    locu['street_address'] = locu['street_address'].map(cleanAddress)

    f=open("fs_clean.json","w")
    f.write(foursquare.to_json(orient='records'))
    f=open("lc_clean.json","w")
    f.write(locu.to_json(orient='records'))

In [62]:
def binary_search(array, target, key):
    lower = 0
    upper = len(array)
    while lower < upper:
        x = lower + int((upper - lower)/2)
        val = array[x][key]
        if target == val:
            return x
        elif target > val:
            if lower == x:
                return lower
            lower = x
        elif target < val:
            upper = x
    return lower

In [356]:
def get_matches(locu_train_path, foursquare_train_path, matches_train_path, locu_test_path, foursquare_test_path):
    
    clean_data(locu_train_path, foursquare_train_path)

    matches = open(matches_train_path).readlines()[1:]
    locu_match = {}
    for l in matches:
        l = l.strip().split(',')
        locu_match[l[0]] = l[1]

    foursquare_data = json.load(open('fs_clean.json'))
    locu_data = json.load(open('lc_clean.json'))
    foursquare_data = np.array(foursquare_data)
    locu_data = np.array(locu_data)

    fs_id_ind = {}
    lo_id_ind = {}
    for l in locu_data:
        lo_id_ind[l['id']] = l
    fs_phone_ind = {}
    fs_name_ind = {}
    for l in foursquare_data:
        fs_id_ind[l['id']] = l
        if l['phone'] is not None and l['phone']!='':
            fs_phone_ind[l['phone']] = l
        if l['name'].replace(" ","") not in fs_name_ind:
            fs_name_ind[l['name'].replace(" ","")] = set()
        fs_name_ind[l['name'].replace(" ","")].add(l['id'])

    foursquare_sortedx = sorted(foursquare_data,key=lambda x:x['latitude'])
    foursquare_sortedy = sorted(foursquare_data,key=lambda x:x['longitude'])

    tot = 0
    param_lat = 0.0015
    param_lon = 0.0015
    ann = {}
    for index,l in enumerate(locu_data):
        if l['latitude'] is not None and l['longitude'] is not None:
            lat_l = binary_search(foursquare_sortedx,l['latitude']-param_lat,'latitude')
            lat_r = binary_search(foursquare_sortedx,l['latitude']+param_lat,'latitude')
            lon_l = binary_search(foursquare_sortedy,l['longitude']-param_lon,'longitude')
            lon_r = binary_search(foursquare_sortedy,l['longitude']+param_lon,'longitude')
            setA = set()
            for i in range(lat_l,min(lat_r+2,len(foursquare_data))):
                setA.add(foursquare_sortedx[i]['id'])
            setB = set()
            for i in range(lon_l,min(lon_r+2,len(foursquare_data))):
                setB.add(foursquare_sortedy[i]['id'])
            setC = setA.intersection(setB)
            if l['name'].replace(" ","") in fs_name_ind:
                setC.update(fs_name_ind[l['name'].replace(" ","")])
            if l['phone'] in fs_phone_ind:
                setC.add(fs_phone_ind[l['phone']]['id'])
            ann[index] = setC
            tot+=len(setC)

    x_train = []
    y_train = []
    x_test = []
    y_test = []
    X = []
    y = []
    c = 0
    for l in ann:
        c+=1
        for fid in ann[l]:
            l1 = locu_data[l]
            l2 = fs_id_ind[fid]
            if l1['id'] in locu_match and locu_match[l1['id']]==fid:
                label=1
            else:
                label=0
            
            f_code = 0
            if l1['postal_code']=='' or l2['postal_code']=='' or l1['postal_code']==l2['postal_code']:
                f_code=1
            
            f_phone = 0
            if l1['phone']=='' or l2['phone']=='' or l1['phone']==l2['phone']:
                f_phone=1
                
            f_name = 0
            if l1['name']=='' or l2['name']=='':
                f_name = 0
            else:
                f_name = ed.eval(l1['name'],l2['name'])/(len(l1['name'])+len(l2['name']))
            
            f_add = 0
            if l1['street_address']=='' or l2['street_address']=='':
                f_add = 0
            else:
                f_add = ed.eval(l1['street_address'],l2['street_address'])/(len(l1['street_address'])+len(l2['street_address']))
            
            f_web = 0
            if l1['website']=='' or l2['website']=='':
                f_web = 0
            else:
                f_web = ed.eval(l1['website'],l2['website'])/(len(l1['website'])+len(l2['website']))   
            
            if c<=450:
                x_train.append([f_code,f_phone,f_name,f_add,f_web])
                y_train.append(label)
            else:
                x_test.append([f_code,f_phone,f_name,f_add,f_web])
                y_test.append(label)
            X.append([f_code,f_phone,f_name,f_add,f_web])
            y.append(label)

    x_train = np.array(x_train)
    x_test = np.array(x_test)
    y_train = np.array(y_train)
    y_test = np.array(y_test)
    
    l_max = 0
    mval = 0
    for l in range(10,201,5):
        y_pred = RFC(n_estimators=l).fit(x_train,y_train).predict(x_test)
        print(f1_score(y_pred,y_test),precision_score(y_pred,y_test),recall_score(y_pred,y_test))
        if f1_score(y_pred,y_test) > mval:
            mval= f1_score(y_pred,y_test)
            l_max = l

    rfc = RFC(n_estimators=l_max).fit(X,y)
    y_pred = rfc.predict(X)

    print(f1_score(y_pred,y),precision_score(y_pred,y),recall_score(y_pred,y))

    clean_data(locu_test_path,foursquare_test_path)

    foursquare_data = json.load(open('fs_clean.json'))
    locu_data = json.load(open('lc_clean.json'))
    foursquare_data = np.array(foursquare_data)
    locu_data = np.array(locu_data)

    fs_id_ind = {}
    lo_id_ind = {}
    for l in locu_data:
        lo_id_ind[l['id']] = l
    fs_phone_ind = {}
    fs_name_ind = {}
    for l in foursquare_data:
        fs_id_ind[l['id']] = l
        if l['phone'] is not None and l['phone']!='':
            fs_phone_ind[l['phone']] = l
        if l['name'].replace(" ","") not in fs_name_ind:
            fs_name_ind[l['name'].replace(" ","")] = set()
        fs_name_ind[l['name'].replace(" ","")].add(l['id'])

    foursquare_sortedx = sorted(foursquare_data,key=lambda x:x['latitude'])
    foursquare_sortedy = sorted(foursquare_data,key=lambda x:x['longitude'])

    tot = 0
    ann = {}
    for index,l in enumerate(locu_data):
        if l['latitude'] is not None and l['longitude'] is not None:
            lat_l = binary_search(foursquare_sortedx,l['latitude']-param_lat,'latitude')
            lat_r = binary_search(foursquare_sortedx,l['latitude']+param_lat,'latitude')
            lon_l = binary_search(foursquare_sortedy,l['longitude']-param_lon,'longitude')
            lon_r = binary_search(foursquare_sortedy,l['longitude']+param_lon,'longitude')
            setA = set()
            for i in range(lat_l,min(lat_r+2,len(foursquare_data))):
                setA.add(foursquare_sortedx[i]['id'])
            setB = set()
            for i in range(lon_l,min(lon_r+2,len(foursquare_data))):
                setB.add(foursquare_sortedy[i]['id'])
            setC = setA.intersection(setB)
            if l['name'].replace(" ","") in fs_name_ind:
                setC.update(fs_name_ind[l['name'].replace(" ","")])
            if l['phone'] in fs_phone_ind:
                setC.add(fs_phone_ind[l['phone']]['id'])
            ann[index] = setC
            tot+=len(setC)
    print(tot)

    X = []
    c = 0
    pairs = []
    for l in ann:
        c+=1
        for fid in ann[l]:
            l1 = locu_data[l]
            l2 = fs_id_ind[fid]

            pairs.append((l1['id'],l2['id']))
            
            f_code = 0
            if l1['postal_code']=='' or l2['postal_code']=='' or l1['postal_code']==l2['postal_code']:
                f_code=1
            
            f_phone = 0
            if l1['phone']=='' or l2['phone']=='' or l1['phone']==l2['phone']:
                f_phone=1
                
            f_name = 0
            if l1['name']=='' or l2['name']=='':
                f_name = 0
            else:
                f_name = ed.eval(l1['name'],l2['name'])/(len(l1['name'])+len(l2['name']))
            
            f_add = 0
            if l1['street_address']=='' or l2['street_address']=='':
                f_add = 0
            else:
                f_add = ed.eval(l1['street_address'],l2['street_address'])/(len(l1['street_address'])+len(l2['street_address']))
            
            f_web = 0
            if l1['website']=='' or l2['website']=='':
                f_web = 0
            else:
                f_web = ed.eval(l1['website'],l2['website'])/(len(l1['website'])+len(l2['website']))

            X.append([f_code,f_phone,f_name,f_add,f_web])

    y_pred = rfc.predict(X)


    f = open('matches_test.csv','w')
    f.write("locu_id,foursquare_id\n")
    for l in range(len(y_pred)):
        if y_pred[l]==1:
            l = pairs[l]
            f.write(str(l[0])+','+str(l[1])+'\n')
    f.close()



In [357]:
get_matches("train/locu_train.json","train/foursquare_train.json","train/matches_train.csv",
            "test/locu_test.json","test/foursquare_test.json")

TypeError: clean_data() missing 1 required positional argument: 'lc'

In [815]:
def clean_data(path,fs,lc):
    
    #load the data
    foursquare_data = pd.read_json(open(path+"/"+fs+'.json'))
    locu_data = pd.read_json(open(path+"/"+lc+'.json'))

    #drop the columns that have most of the columns same
    foursquare = foursquare_data.drop(['country', 'region', 'locality'], axis=1)
    locu = locu_data.drop(['country', 'region', 'locality'], axis=1)

    def cleanPhone(x):
        if x is None or x is ',':
            return ''
        else:
            return x.replace('(', '').replace(')', '').replace('-', '').replace(' ', '')

    def cleanWebsite(x):
        if x is None or x is ',':
            return ''
        else:
            return x.replace('https://', '').replace('http://', '').replace('www.', '')

    def cleanName(x):
        if x is None or x is ',':
            return ''
        else:
            x = x.replace('\'', '').replace('#', '').replace('&', '').replace('-', ' ').replace('/', ' ')\
                .replace(":","")
            x = x.lower()
            return x

    def cleanAddress(x):
        if x is None or x is ',':
            return ''
        else:
            x = x.replace('\'', '').replace('#', '').replace('&', '').replace('.', '').replace('@', '').\
            replace(',', '').replace('-', ' ').replace('/', ' ').replace(":","")
            x = x.lower()
            x = x.replace('street', 'st').replace('avenue', 'ave').replace('boulevard', 'blvd').\
            replace('place', 'pl').replace('square', 'sq').replace('plaza', 'plz')
            x = " "+x
            x = x.replace(" zero","0").replace(" one","1").replace(" two","2").replace(" three","3")\
                .replace(" four","4").replace(" five","5").replace(" six","6").replace(" seven","7")\
                .replace(" eight","8").replace(" nine","9")
            x = x.strip()
            return x
            return x

    foursquare['phone'] = foursquare['phone'].map(cleanPhone)
    foursquare['website'] = foursquare['website'].map(cleanWebsite)
    foursquare['name'] = foursquare['name'].map(cleanName)
    foursquare['street_address'] = foursquare['street_address'].map(cleanAddress)

    locu['phone'] = locu['phone'].map(cleanPhone)
    locu['website'] = locu['website'].map(cleanWebsite)
    locu['name'] = locu['name'].map(cleanName)
    locu['street_address'] = locu['street_address'].map(cleanAddress)

    f=open("fs_clean.json","w")
    f.write(foursquare.to_json(orient='records'))
    f=open("lc_clean.json","w")
    f.write(locu.to_json(orient='records'))

In [879]:
def binary_search(array, target, key):
    lower = 0
    upper = len(array)
    while lower < upper:
        x = lower + int((upper - lower)/2)
        val = array[x][key]
        if target == val:
            return x
        elif target > val:
            if lower == x:
                return lower
            lower = x
        elif target < val:
            upper = x
    return lower

In [880]:
clean_data("train","foursquare_train","locu_train")

In [881]:
matches = open('train/matches_train.csv').readlines()[1:]
locu_mat = {}
ch = {}
for l in matches:
    l = l.strip().split(',')
    locu_match[l[0]] = l[1]

In [882]:
foursquare_data = json.load(open('fs_clean.json'))
locu_data = json.load(open('lc_clean.json'))
foursquare_data = np.array(foursquare_data)
locu_data = np.array(locu_data)

fs_id_ind = {}
lo_id_ind = {}
for l in locu_data:
    lo_id_ind[l['id']] = l
fs_phone_ind = {}
fs_name_ind = {}
for l in foursquare_data:
    fs_id_ind[l['id']] = l
    if l['phone'] is not None and l['phone']!='':
        fs_phone_ind[l['phone']] = l
    if l['name'].replace(" ","") not in fs_name_ind:
        fs_name_ind[l['name'].replace(" ","")] = set()
    fs_name_ind[l['name'].replace(" ","")].add(l['id'])

foursquare_sortedx = sorted(foursquare_data,key=lambda x:x['latitude'])
foursquare_sortedy = sorted(foursquare_data,key=lambda x:x['longitude'])

param_lat = 0.002
param_lon = 0.002
tot = 0
c=0
ann = {}
for index,l in enumerate(locu_data):
    if l['latitude'] is not None and l['longitude'] is not None:

        lat_l = binary_search(foursquare_sortedx,l['latitude']-param_lat,'latitude')
        lat_r = binary_search(foursquare_sortedx,l['latitude']+param_lat,'latitude')
        lon_l = binary_search(foursquare_sortedy,l['longitude']-param_lon,'longitude')
        lon_r = binary_search(foursquare_sortedy,l['longitude']+param_lon,'longitude')
        setA = set()
        for i in range(lat_l,min(lat_r+2,len(foursquare_data))):
            setA.add(foursquare_sortedx[i]['id'])
        setB = set()
        for i in range(lon_l,min(lon_r+2,len(foursquare_data))):
            setB.add(foursquare_sortedy[i]['id'])
        setC = setA.intersection(setB)
        if l['name'].replace(" ","") in fs_name_ind:
            setC.update(fs_name_ind[l['name'].replace(" ","")])
        if l['phone'] in fs_phone_ind:
            setC.add(fs_phone_ind[l['phone']]['id'])
        if l['id'] in locu_match:
            if locu_match[l['id']] in setC:
                c+=1
            else:
                setC.add(locu_match[l["id"]])
                print(lo_id_ind[l['id']],fs_id_ind[locu_match[l['id']]])
                print(lat_l,lat_r,lon_l,lon_r)
                print(lo_id_ind[l['id']]['latitude']-fs_id_ind[locu_match[l['id']]]['latitude'])
                print(lo_id_ind[l['id']]['longitude']-fs_id_ind[locu_match[l['id']]]['longitude'])
                print("---------------------------------------------")
        ann[index] = setC
        tot+=len(setC)
    else:
        setC = set()
        if l['name'].replace(" ","") in fs_name_ind:
            setC.update(fs_name_ind[l['name'].replace(" ","")])
        if l['phone'] in fs_phone_ind:
            setC.add(fs_phone_ind[l['phone']]['id'])
        ann[index] = setC
        tot+=len(setC)
print(c,tot)

{'id': 'c170270283ef870d546b', 'latitude': 40.766195, 'longitude': -73.977825, 'name': 'exhale spa', 'phone': '2125617400', 'postal_code': '10019', 'street_address': '150 central park south', 'website': 'exhalespa.com/locations/midtown'} {'id': '51eb7eed498e401ec51196b6', 'latitude': 40.75798, 'longitude': -73.98122, 'name': 'halel cart', 'phone': '', 'postal_code': '', 'street_address': '', 'website': ''}
433 457 379 422
0.008214999999999861
0.0033949999999975944
---------------------------------------------
{'id': '5f3fd107090d0ddc658b', 'latitude': 40.713998, 'longitude': -73.996882, 'name': 'tsung sun social club', 'phone': '2122269414', 'postal_code': '10002', 'street_address': '11 division st', 'website': ''} {'id': '51ce011a498ed8dfb15381bb', 'latitude': 40.7726492359, 'longitude': -73.9761875778, 'name': 'spring social running club', 'phone': '', 'postal_code': '', 'street_address': '', 'website': ''}
32 54 119 159
-0.0586512359000011
-0.020694422199994733
---------------------

In [886]:
def create_features(l1,l2):
    feat = []
    f_code = 0
    if l1['postal_code']=='' or l2['postal_code']=='':
        f_code=0
    elif l1['postal_code']==l2['postal_code']:
        f_code = 1
    feat.append(f_code)

    f_phone = 0
    f_phone_last = 0
    if l1['phone']=='' or l2['phone']=='':
        f_phone=0
        f_phone_last = 0
    else:
        if l1['phone']==l2['phone']:
            f_phone=1
        if l1['phone'][-4:] == l2['phone'][-4:]:
            f_phone_last = 1
    feat.append(f_phone)
    feat.append(f_phone_last)
    
    f_name = 0
    f_name_jaccard = 0
    if l1['name']=='' or l2['name']=='':
        f_name = 0
        f_name_jaccard = 0
    else:
        f_name = ed.eval(l1['name'],l2['name'])/max(len(l1['name']),len(l2['name']))
        setA = set([stemmer.stem(i) for i in l1['name'].split(" ")])
        setB = set([stemmer.stem(i) for i in l2['name'].split(" ")])
        f_name_jaccard = len(setA.intersection(setB))/len(setA.union(setB))
    feat.append(f_name)
    feat.append(f_name_jaccard)

    f_add = 0
    f_add_num = 0
    if l1['street_address']=='' or l2['street_address']=='':
        f_add = 1
        f_add_num = 0
    else:
        f_add = ed.eval(l1['street_address'],l2['street_address'])/max(len(l1['street_address']),len(l2['street_address']))
        l1_num = set(re.findall(r'\d+', l1['street_address']))
        l2_num = set(re.findall(r'\d+', l2['street_address']))
        if len(l1_num.union(l2_num))==0:
            f_add_num = 0
        else:
            f_add_num = len(l1_num.intersection(l2_num))/len(l1_num.union(l2_num))
    feat.append(f_add)
    feat.append(f_add_num)

    f_web = 0
    if l1['website']=='' or l2['website']=='':
        f_web = 1
    else:
        f_web = ed.eval(l1['website'],l2['website'])/max(len(l1['website']),len(l2['website']))
    feat.append(f_web)

    f_lat = 1
    f_lon = 1
    if l1['latitude'] is not None and l2['latitude'] is not None:
        f_lat = l1['latitude']-l2['latitude']
        f_lon = l1['longitude']-l2['longitude']
    feat.append(f_lat)
    feat.append(f_lon)
    
    return feat

In [887]:
x_train = []
y_train = []
x_test = []
y_test = []
X = []
y = []
pairs = []
c = 0
train_sample = 0
for l in ann:
    c+=1
    for fid in ann[l]:
        l1 = locu_data[l]
        l2 = fs_id_ind[fid]
        if l1['id'] in locu_match and locu_match[l1['id']]==fid:
            label=1
        else:
            label=0
        
        pairs.append((l1,l2))
        
        feat = create_features(l1,l2)
        
        if c<=450:
            train_sample+=1
        X.append(feat)
        y.append(label)

In [888]:
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.metrics import f1_score,precision_score, recall_score

In [889]:
X = np.array(X)
y = np.array(y)

poly = Poly()
X = poly.fit_transform(X)
x_train = X[:train_sample]
x_test = X[train_sample:]
y_train = y[:train_sample]
y_test = y[train_sample:]

In [890]:
# l_max = 0
# mval = 0
# for l in range(10,201,5):
#     rfc = RFC(n_estimators=l,random_state=1729).fit(x_train,y_train)
#     y_pred = rfc.predict(x_test)
#     print(f1_score(y_pred,y_test),precision_score(y_pred,y_test),recall_score(y_pred,y_test),accuracy_score(y_pred,y_test))
#     if accuracy_score(y_pred,y_test) > mval:
#         mval= accuracy_score(y_pred,y_test)
#         l_max = l
#     y_pred = rfc.predict(x_train)

rfc = RFC(n_estimators=120).fit(x_train,y_train)
y_pred = rfc.predict(x_test)
rfc = RFC(n_estimators=120).fit(X,y)

In [891]:
c = 0
for l in range(len(y_test)):
    if y_test[l] != y_pred[l]:
        c+=1
        print(pairs[len(y_train)+l],y_test[l],y_pred[l])
print(c)

({'id': '212dffb393f745df801a', 'latitude': 40.739838, 'longitude': -73.98957, 'name': 'brio flatiron', 'phone': '2126732121', 'postal_code': '10003', 'street_address': '920 broadway', 'website': 'brioflatiron.com/'}, {'id': '51e869ac498e7e485cabcdeb', 'latitude': 40.744498, 'longitude': -73.990373, 'name': 'home plate @flatiron school', 'phone': '', 'postal_code': '', 'street_address': '', 'website': ''}) 1 0
({'id': '0b6dfce85d74f1b65f52', 'latitude': 40.781951, 'longitude': -73.975479, 'name': 'west 79 street boat basin cafe', 'phone': '2124965542', 'postal_code': '10024', 'street_address': 'w 79 st', 'website': 'boatbasincafe.com/'}, {'id': '40ede000f964a5203c0a1fe3', 'latitude': 40.785675637, 'longitude': -73.9845085144, 'name': 'boat basin café', 'phone': '2124965542', 'postal_code': '10024', 'street_address': '390 w 79th st', 'website': ''}) 0 1
({'id': '7b0512c99f091ec1e7d0', 'latitude': 40.755168, 'longitude': -73.979409, 'name': 'starbucks', 'phone': '2126871026', 'postal_cod

In [892]:
y_pred = rfc.predict(x_train)
c = 0
for l in range(len(y_train)):
    if y_train[l] != y_pred[l]:
        c+=1
        print(pairs[l],y_train[l],y_pred[l])
print(c)

0


In [868]:
rfc.feature_importances_

array([  0.00000000e+00,   1.97728077e-03,   1.68801431e-02,
         9.57379283e-03,   3.34029971e-02,   7.32061483e-02,
         3.49535951e-02,   4.78186132e-02,   1.88839867e-03,
         9.15217916e-03,   5.87014841e-03,   1.42031227e-03,
         1.30695731e-04,   2.18423211e-04,   8.48837495e-04,
         3.90380176e-02,   6.57525690e-04,   1.17174562e-02,
         1.53001010e-03,   8.03375669e-04,   4.00051589e-04,
         2.18639347e-02,   1.28893841e-02,   2.08220749e-03,
         1.77979882e-02,   1.60454154e-03,   1.10270987e-02,
         4.78514933e-03,   2.27969745e-03,   1.89152828e-03,
         3.32487257e-03,   1.23894941e-03,   2.16240493e-02,
         2.45399079e-03,   6.01610826e-03,   5.74830913e-03,
         4.72688455e-03,   4.57442535e-04,   5.53360093e-02,
         1.07483486e-03,   5.36250585e-02,   2.65658308e-03,
         5.07926691e-02,   1.87443614e-03,   2.98891712e-03,
         8.65100592e-02,   1.54687363e-02,   9.28332949e-02,
         1.43384550e-02,

In [869]:
clean_data("test","foursquare_test","locu_test")

In [870]:
foursquare_data = json.load(open('fs_clean.json'))
locu_data = json.load(open('lc_clean.json'))
foursquare_data = np.array(foursquare_data)
locu_data = np.array(locu_data)

fs_id_ind = {}
lo_id_ind = {}
for l in locu_data:
    lo_id_ind[l['id']] = l
fs_phone_ind = {}
fs_name_ind = {}
for l in foursquare_data:
    fs_id_ind[l['id']] = l
    if l['phone'] is not None and l['phone']!='':
        fs_phone_ind[l['phone']] = l
    if l['name'].replace(" ","") not in fs_name_ind:
        fs_name_ind[l['name'].replace(" ","")] = set()
    fs_name_ind[l['name'].replace(" ","")].add(l['id'])

foursquare_sortedx = sorted(foursquare_data,key=lambda x:x['latitude'])
foursquare_sortedy = sorted(foursquare_data,key=lambda x:x['longitude'])

tot = 0
ann = {}
for index,l in enumerate(locu_data):
    if l['latitude'] is not None and l['longitude'] is not None:
        lat_l = binary_search(foursquare_sortedx,l['latitude']-param_lat,'latitude')
        lat_r = binary_search(foursquare_sortedx,l['latitude']+param_lat,'latitude')
        lon_l = binary_search(foursquare_sortedy,l['longitude']-param_lon,'longitude')
        lon_r = binary_search(foursquare_sortedy,l['longitude']+param_lon,'longitude')
        setA = set()
        for i in range(lat_l,min(lat_r+2,len(foursquare_data))):
            setA.add(foursquare_sortedx[i]['id'])
        setB = set()
        for i in range(lon_l,min(lon_r+2,len(foursquare_data))):
            setB.add(foursquare_sortedy[i]['id'])
        setC = setA.intersection(setB)
        if l['name'].replace(" ","") in fs_name_ind:
            setC.update(fs_name_ind[l['name'].replace(" ","")])
        if l['phone'] in fs_phone_ind:
            setC.add(fs_phone_ind[l['phone']]['id'])
        ann[index] = setC
        tot+=len(setC)
    else:
        setC = set()
        for i in foursquare_data:
            setC.add(i["id"])
        ann[index] = setC
        tot+=len(setC)
print(tot)

1730


In [871]:
X = []
c = 0
neigh = {}
pairs = []
for l in ann:
    c_old = c
    for fid in ann[l]:
        l1 = locu_data[l]
        l2 = fs_id_ind[fid]

        pairs.append((l1['id'],l2['id']))
        
        feat = create_features(l1,l2)

        X.append(feat)
        
        c+=1
    neigh[l] = (c_old,c)
len(X)

1730

In [872]:
X = poly.transform(X)
y_pred = rfc.predict(X)

In [873]:
sum(y_pred)

634

In [874]:
len(locu_data)

400

In [875]:
y_pred_prob = rfc.predict_proba(X)

In [876]:
for l in range(len(y_pred)):
    if y_pred[l]==1:
        print(y_pred_prob[l], pairs[l])

[ 0.  1.] ('b48da849c54f904013e2', '4de5a887d4c09fc98b6e0498')
[ 0.01666667  0.98333333] ('95ad783fd1c65bb8fdbf', '459f6987f964a520c0401fe3')
[ 0.01666667  0.98333333] ('206c363a5907bfa98ec0', '4fa163a4e4b00c5d71cf15c5')
[ 0.  1.] ('cb95d1e0730222cc3209', '4cc5f773c844721edb15f201')
[ 0.20833333  0.79166667] ('25ca87e725b930488ed6', '4ab26766f964a520146b20e3')
[ 0.  1.] ('4f9710321455164d1cb4', '500a9f95e4b00d76e499d83e')
[ 0.  1.] ('138aed748ccdfed0845d', '4b284ea7f964a520519224e3')
[ 0.  1.] ('5013d96a9633f92f2dbf', '4f3257e619836c91c7ce6b37')
[ 0.  1.] ('b2affc7b53499fd8ee76', '4e4c4f7ebd413c4cc66867f1')
[ 0.  1.] ('7b11cde179779d2e814d', '4d6a8a5d0a25b60cb4743290')
[ 0.  1.] ('1ff1b1c1d5252ca3682a', '4b212d63f964a520c63724e3')
[ 0.  1.] ('417df85b1156d93935b3', '5103351ee4b0ad7f2e01d882')
[ 0.  1.] ('be74338f204c673792a9', '519b78372fc6182159f7eb0d')
[ 0.  1.] ('76b75b0a67c0bcd6682f', '4f329b2519836c91c7e82627')
[ 0.  1.] ('0b349f0d5401a0e1d58e', '4c52c418d797e21e4757137e')
[ 0.  1

[ 0.325  0.675] ('83b07fc4b00eaf63844e', '4f44fb6219836ed00197e7de')
[ 0.29166667  0.70833333] ('83b07fc4b00eaf63844e', '4f3c8967e4b02fa0cf4e9148')
[ 0.35  0.65] ('83b07fc4b00eaf63844e', '4d8a588226a36ea870ecf2ad')
[ 0.34166667  0.65833333] ('83b07fc4b00eaf63844e', '4f5d81dfe4b0c8398471bdc7')
[ 0.375  0.625] ('83b07fc4b00eaf63844e', '4c3c85e3a9509c74b52e395b')
[ 0.33333333  0.66666667] ('83b07fc4b00eaf63844e', '4f32667719836c91c7d41837')
[ 0.35  0.65] ('83b07fc4b00eaf63844e', '3fd66200f964a520aee41ee3')
[ 0.39166667  0.60833333] ('83b07fc4b00eaf63844e', '4c62d76879d1e21e5b10d715')
[ 0.43333333  0.56666667] ('83b07fc4b00eaf63844e', '4dbceae34b222080d39eb570')
[ 0.3  0.7] ('83b07fc4b00eaf63844e', '4d6a8a5d0a25b60cb4743290')
[ 0.35  0.65] ('83b07fc4b00eaf63844e', '4f2ed62ae4b010c5f73c907e')
[ 0.35  0.65] ('83b07fc4b00eaf63844e', '4f32240c19836c91c7ba2429')
[ 0.39166667  0.60833333] ('83b07fc4b00eaf63844e', '4d06fb1054d0236a5f13fcd5')
[ 0.325  0.675] ('83b07fc4b00eaf63844e', '4aa3f1fff964a

In [877]:
f = open('matches_test.csv','w')
f.write("locu_id,foursquare_id\n")
unique_locu = {}
unique_forsquare = {}
for l in range(len(y_pred)):
    if y_pred[l]==1:
        l = pairs[l]
        f.write(str(l[0])+','+str(l[1])+'\n')
        if l[0] in unique_locu:
            print(l[0],l[1])
        if l[1] in unique_forsquare:
            print(l[0],l[1])
        unique_locu[l[0]] = 0
        unique_forsquare[l[1]] = 0
f.close()

83b07fc4b00eaf63844e 506f5304e4b04b4e0c1b1964
83b07fc4b00eaf63844e 4bc0f15f4cdfc9b6b0d29321
83b07fc4b00eaf63844e 4e2a37d37d8b7deda6c3cff4
83b07fc4b00eaf63844e 4e2a37d37d8b7deda6c3cff4
83b07fc4b00eaf63844e 4cceb7cc54f0b1f7c9b723ca
83b07fc4b00eaf63844e 4d91d84e270e6ea897bb5bff
83b07fc4b00eaf63844e 4d2a6294c406721edc8683b6
83b07fc4b00eaf63844e 4a84a19bf964a5202afd1fe3
83b07fc4b00eaf63844e 4a84a19bf964a5202afd1fe3
83b07fc4b00eaf63844e 4c31092966e40f475d70c48b
83b07fc4b00eaf63844e 4c31092966e40f475d70c48b
83b07fc4b00eaf63844e 4f47a161e4b068e1ce89dfe1
83b07fc4b00eaf63844e 4c804cc374d7b60c48a96cd8
83b07fc4b00eaf63844e 4c804cc374d7b60c48a96cd8
83b07fc4b00eaf63844e 4f831c11e4b0bc14199ba390
83b07fc4b00eaf63844e 4f831c11e4b0bc14199ba390
83b07fc4b00eaf63844e 4c0048a137850f47a410983f
83b07fc4b00eaf63844e 4c0048a137850f47a410983f
83b07fc4b00eaf63844e 4fbd9bc0e4b09f409bd4fa94
83b07fc4b00eaf63844e 4fbd9bc0e4b09f409bd4fa94
83b07fc4b00eaf63844e 40d23080f964a5203e011fe3
83b07fc4b00eaf63844e 40d23080f964a

In [858]:
len(unique_locu),len(unique_forsquare)

(234, 400)

In [859]:
fs_dup = {}
for l in neigh:
    max
    for k in range(neigh[l][0],neigh[l][1]):
        if 

b48da849c54f904013e2 4de5a887d4c09fc98b6e0498
b48da849c54f904013e2 4de5a887d4c09fc98b6e0498
95ad783fd1c65bb8fdbf 459f6987f964a520c0401fe3
95ad783fd1c65bb8fdbf 459f6987f964a520c0401fe3
206c363a5907bfa98ec0 4fa163a4e4b00c5d71cf15c5
206c363a5907bfa98ec0 4fa163a4e4b00c5d71cf15c5
cb95d1e0730222cc3209 4cc5f773c844721edb15f201
cb95d1e0730222cc3209 4cc5f773c844721edb15f201
25ca87e725b930488ed6 4ab26766f964a520146b20e3
25ca87e725b930488ed6 4ab26766f964a520146b20e3
4f9710321455164d1cb4 500a9f95e4b00d76e499d83e
4f9710321455164d1cb4 500a9f95e4b00d76e499d83e
138aed748ccdfed0845d 4b284ea7f964a520519224e3
138aed748ccdfed0845d 4b284ea7f964a520519224e3
5013d96a9633f92f2dbf 4f3257e619836c91c7ce6b37
5013d96a9633f92f2dbf 4f3257e619836c91c7ce6b37
b2affc7b53499fd8ee76 4e4c4f7ebd413c4cc66867f1
b2affc7b53499fd8ee76 4e4c4f7ebd413c4cc66867f1
7b11cde179779d2e814d 4d6a8a5d0a25b60cb4743290
7b11cde179779d2e814d 4d6a8a5d0a25b60cb4743290
1ff1b1c1d5252ca3682a 4b212d63f964a520c63724e3
1ff1b1c1d5252ca3682a 4b212d63f964a

4300146bafa2e5a5032d 4f32968f19836c91c7e6701c
4300146bafa2e5a5032d 4f32968f19836c91c7e6701c
3e1758a6eb76f7bd1a4f 4f668416e4b03c270bba1400
3e1758a6eb76f7bd1a4f 4f668416e4b03c270bba1400
a7b48d9a9bbc9b5fb2dc 5072d219e4b0f7be6ec39008
a7b48d9a9bbc9b5fb2dc 5072d219e4b0f7be6ec39008
9650a6415d1483688f26 4f321bad19836c91c7b6ae9e
9650a6415d1483688f26 4f321bad19836c91c7b6ae9e
030d9755ac45aae8179a 4c8d644cf87e224bb3643d05
030d9755ac45aae8179a 4c8d644cf87e224bb3643d05
d9141695a158bc3412ca 4b9aeeedf964a52086e435e3
d9141695a158bc3412ca 4b9aeeedf964a52086e435e3
7ae31d035d0df5e9a45b 5191641e498ef1f5e007366c
7ae31d035d0df5e9a45b 5191641e498ef1f5e007366c
93efb0483ee2c0925b73 4c0716c68b4520a1f7808697
93efb0483ee2c0925b73 4c0716c68b4520a1f7808697
c583d18591a9c5a89254 521d6349498e4164db05510e
c583d18591a9c5a89254 521d6349498e4164db05510e
fae3da1dc12d3a78de9d 4bde28866c1b9521ac51ad0f
fae3da1dc12d3a78de9d 4bde28866c1b9521ac51ad0f
65960679ec926c525e4d 4f4f787be4b04078851f5fbd
65960679ec926c525e4d 4f4f787be4b04

In [802]:
f = open("temp.csv").readlines()[1:]

In [803]:
for l in f:
    l = l.strip().split(",")
    if l[0] not in unique_locu:
        print(l)

['661d76421653c345fb73', '517074ece4b0b0c87744eff4']
