In [411]:
import json
import csv
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import editdistance as ed
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.metrics import f1_score,precision_score, recall_score, accuracy_score
from sklearn.preprocessing import PolynomialFeatures as Poly
import re
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

In [126]:
def clean_data(locu_path,fs_path):
        
    #load the data
    foursquare_data = pd.read_json(open(fs_path))
    locu_data = pd.read_json(open(locu_path))

    #drop the columns that have most of the columns same
    foursquare = foursquare_data.drop(['country', 'region', 'locality'], axis=1)
    locu = locu_data.drop(['country', 'region', 'locality'], axis=1)

    def cleanPhone(x):
        if x is None or x is ',':
            return ''
        else:
            return x.replace('(', '').replace(')', '').replace('-', '').replace(' ', '')

    def cleanWebsite(x):
        if x is None or x is ',':
            return ''
        else:
            return x.replace('https://', '').replace('http://', '').replace('www.', '')

    def cleanName(x):
        if x is None or x is ',':
            return ''
        else:
            x = x.replace('\'s', '').replace('#', '').replace('&', '').replace('-', ' ').replace('/', ' ')
            x = x.lower()
            return x

    def cleanAddress(x):
        if x is None or x is ',':
            return ''
        else:
            x = x.replace('\'', '').replace('#', '').replace('&', '').replace('.', '')\
                .replace('@', '').replace(',', '').replace('-', ' ').replace('/', ' ')
            x = x.lower()
            x = x.replace('street', 'st').replace('avenue', 'ave').replace('boulevard', 'blvd')\
                    .replace('place', 'pl').replace('square', 'sq').replace('plaza', 'plz')
            x = " "+x
            x = x.replace(" zero","0").replace(" one","1").replace(" two","2").replace(" three","3")\
                .replace(" four","4").replace(" five","5").replace(" six","6").replace(" seven","7")\
                .replace(" eight","8").replace(" nine","9")
            x = x.strip()
            return x

    foursquare['phone'] = foursquare['phone'].map(cleanPhone)
    foursquare['website'] = foursquare['website'].map(cleanWebsite)
    foursquare['name'] = foursquare['name'].map(cleanName)
    foursquare['street_address'] = foursquare['street_address'].map(cleanAddress)

    locu['phone'] = locu['phone'].map(cleanPhone)
    locu['website'] = locu['website'].map(cleanWebsite)
    locu['name'] = locu['name'].map(cleanName)
    locu['street_address'] = locu['street_address'].map(cleanAddress)

    f=open("fs_clean.json","w")
    f.write(foursquare.to_json(orient='records'))
    f=open("lc_clean.json","w")
    f.write(locu.to_json(orient='records'))

In [62]:
def binary_search(array, target, key):
    lower = 0
    upper = len(array)
    while lower < upper:
        x = lower + int((upper - lower)/2)
        val = array[x][key]
        if target == val:
            return x
        elif target > val:
            if lower == x:
                return lower
            lower = x
        elif target < val:
            upper = x
    return lower

In [14]:
def get_matches(locu_train_path, foursquare_train_path, matches_train_path, locu_test_path, foursquare_test_path):
    
    clean_data(locu_train_path, foursquare_train_path)

    matches = open(matches_train_path).readlines()[1:]
    locu_match = {}
    for l in matches:
        l = l.strip().split(',')
        locu_match[l[0]] = l[1]

    foursquare_data = json.load(open('fs_clean.json'))
    locu_data = json.load(open('lc_clean.json'))
    foursquare_data = np.array(foursquare_data)
    locu_data = np.array(locu_data)

    fs_id_ind = {}
    lo_id_ind = {}
    for l in locu_data:
        lo_id_ind[l['id']] = l
    fs_phone_ind = {}
    fs_name_ind = {}
    for l in foursquare_data:
        fs_id_ind[l['id']] = l
        if l['phone'] is not None and l['phone']!='':
            fs_phone_ind[l['phone']] = l
        if l['name'].replace(" ","") not in fs_name_ind:
            fs_name_ind[l['name'].replace(" ","")] = set()
        fs_name_ind[l['name'].replace(" ","")].add(l['id'])

    foursquare_sortedx = sorted(foursquare_data,key=lambda x:x['latitude'])
    foursquare_sortedy = sorted(foursquare_data,key=lambda x:x['longitude'])

    tot = 0
    param_lat = 0.0015
    param_lon = 0.0015
    ann = {}
    for index,l in enumerate(locu_data):
        if l['latitude'] is not None and l['longitude'] is not None:
            lat_l = binary_search(foursquare_sortedx,l['latitude']-param_lat,'latitude')
            lat_r = binary_search(foursquare_sortedx,l['latitude']+param_lat,'latitude')
            lon_l = binary_search(foursquare_sortedy,l['longitude']-param_lon,'longitude')
            lon_r = binary_search(foursquare_sortedy,l['longitude']+param_lon,'longitude')
            setA = set()
            for i in range(lat_l,min(lat_r+2,len(foursquare_data))):
                setA.add(foursquare_sortedx[i]['id'])
            setB = set()
            for i in range(lon_l,min(lon_r+2,len(foursquare_data))):
                setB.add(foursquare_sortedy[i]['id'])
            setC = setA.intersection(setB)
            if l['name'].replace(" ","") in fs_name_ind:
                setC.update(fs_name_ind[l['name'].replace(" ","")])
            if l['phone'] in fs_phone_ind:
                setC.add(fs_phone_ind[l['phone']]['id'])
            ann[index] = setC
            tot+=len(setC)

    x_train = []
    y_train = []
    x_test = []
    y_test = []
    X = []
    y = []
    c = 0
    for l in ann:
        c+=1
        for fid in ann[l]:
            l1 = locu_data[l]
            l2 = fs_id_ind[fid]
            if l1['id'] in locu_match and locu_match[l1['id']]==fid:
                label=1
            else:
                label=0
            
            f_code = 0
            if l1['postal_code']=='' or l2['postal_code']=='' or l1['postal_code']==l2['postal_code']:
                f_code=1
            
            f_phone = 0
            if l1['phone']=='' or l2['phone']=='' or l1['phone']==l2['phone']:
                f_phone=1
                
            f_name = 0
            if l1['name']=='' or l2['name']=='':
                f_name = 0
            else:
                f_name = ed.eval(l1['name'],l2['name'])/(len(l1['name'])+len(l2['name']))
            
            f_add = 0
            if l1['street_address']=='' or l2['street_address']=='':
                f_add = 0
            else:
                f_add = ed.eval(l1['street_address'],l2['street_address'])/(len(l1['street_address'])+len(l2['street_address']))
            
            f_web = 0
            if l1['website']=='' or l2['website']=='':
                f_web = 0
            else:
                f_web = ed.eval(l1['website'],l2['website'])/(len(l1['website'])+len(l2['website']))   
            
            if c<=450:
                x_train.append([f_code,f_phone,f_name,f_add,f_web])
                y_train.append(label)
            else:
                x_test.append([f_code,f_phone,f_name,f_add,f_web])
                y_test.append(label)
            X.append([f_code,f_phone,f_name,f_add,f_web])
            y.append(label)

    x_train = np.array(x_train)
    x_test = np.array(x_test)
    y_train = np.array(y_train)
    y_test = np.array(y_test)
    
    l_max = 0
    mval = 0
    for l in range(10,201,5):
        y_pred = RFC(n_estimators=l).fit(x_train,y_train).predict(x_test)
        print(f1_score(y_pred,y_test),precision_score(y_pred,y_test),recall_score(y_pred,y_test))
        if f1_score(y_pred,y_test) > mval:
            mval= f1_score(y_pred,y_test)
            l_max = l

    rfc = RFC(n_estimators=l_max).fit(X,y)
    y_pred = rfc.predict(X)

    print(f1_score(y_pred,y),precision_score(y_pred,y),recall_score(y_pred,y))

    clean_data(locu_test_path,foursquare_test_path)

    foursquare_data = json.load(open('fs_clean.json'))
    locu_data = json.load(open('lc_clean.json'))
    foursquare_data = np.array(foursquare_data)
    locu_data = np.array(locu_data)

    fs_id_ind = {}
    lo_id_ind = {}
    for l in locu_data:
        lo_id_ind[l['id']] = l
    fs_phone_ind = {}
    fs_name_ind = {}
    for l in foursquare_data:
        fs_id_ind[l['id']] = l
        if l['phone'] is not None and l['phone']!='':
            fs_phone_ind[l['phone']] = l
        if l['name'].replace(" ","") not in fs_name_ind:
            fs_name_ind[l['name'].replace(" ","")] = set()
        fs_name_ind[l['name'].replace(" ","")].add(l['id'])

    foursquare_sortedx = sorted(foursquare_data,key=lambda x:x['latitude'])
    foursquare_sortedy = sorted(foursquare_data,key=lambda x:x['longitude'])

    tot = 0
    ann = {}
    for index,l in enumerate(locu_data):
        if l['latitude'] is not None and l['longitude'] is not None:
            lat_l = binary_search(foursquare_sortedx,l['latitude']-param_lat,'latitude')
            lat_r = binary_search(foursquare_sortedx,l['latitude']+param_lat,'latitude')
            lon_l = binary_search(foursquare_sortedy,l['longitude']-param_lon,'longitude')
            lon_r = binary_search(foursquare_sortedy,l['longitude']+param_lon,'longitude')
            setA = set()
            for i in range(lat_l,min(lat_r+2,len(foursquare_data))):
                setA.add(foursquare_sortedx[i]['id'])
            setB = set()
            for i in range(lon_l,min(lon_r+2,len(foursquare_data))):
                setB.add(foursquare_sortedy[i]['id'])
            setC = setA.intersection(setB)
            if l['name'].replace(" ","") in fs_name_ind:
                setC.update(fs_name_ind[l['name'].replace(" ","")])
            if l['phone'] in fs_phone_ind:
                setC.add(fs_phone_ind[l['phone']]['id'])
            ann[index] = setC
            tot+=len(setC)
    print(tot)

    X = []
    c = 0
    pairs = []
    for l in ann:
        c+=1
        for fid in ann[l]:
            l1 = locu_data[l]
            l2 = fs_id_ind[fid]

            pairs.append((l1['id'],l2['id']))
            
            f_code = 0
            if l1['postal_code']=='' or l2['postal_code']=='' or l1['postal_code']==l2['postal_code']:
                f_code=1
            
            f_phone = 0
            if l1['phone']=='' or l2['phone']=='' or l1['phone']==l2['phone']:
                f_phone=1
                
            f_name = 0
            if l1['name']=='' or l2['name']=='':
                f_name = 0
            else:
                f_name = ed.eval(l1['name'],l2['name'])/(len(l1['name'])+len(l2['name']))
            
            f_add = 0
            if l1['street_address']=='' or l2['street_address']=='':
                f_add = 0
            else:
                f_add = ed.eval(l1['street_address'],l2['street_address'])/(len(l1['street_address'])+len(l2['street_address']))
            
            f_web = 0
            if l1['website']=='' or l2['website']=='':
                f_web = 0
            else:
                f_web = ed.eval(l1['website'],l2['website'])/(len(l1['website'])+len(l2['website']))

            X.append([f_code,f_phone,f_name,f_add,f_web])

    y_pred = rfc.predict(X)


    f = open('matches_test.csv','w')
    f.write("locu_id,foursquare_id\n")
    for l in range(len(y_pred)):
        if y_pred[l]==1:
            l = pairs[l]
            f.write(str(l[0])+','+str(l[1])+'\n')
    f.close()



In [357]:
get_matches("train/locu_train.json","train/foursquare_train.json","train/matches_train.csv",
            "test/locu_test.json","test/foursquare_test.json")

TypeError: clean_data() missing 1 required positional argument: 'lc'

In [1507]:
def clean_data(path,fs,lc):
    
    #load the data
    foursquare_data = pd.read_json(open(path+"/"+fs+'.json'))
    locu_data = pd.read_json(open(path+"/"+lc+'.json'))

    #drop the columns that have most of the columns same
    foursquare = foursquare_data.drop(['country', 'region', 'locality'], axis=1)
    locu = locu_data.drop(['country', 'region', 'locality'], axis=1)

    def cleanPhone(x):
        if x is None or x is ',':
            return ''
        else:
            return x.replace('(', '').replace(')', '').replace('-', '').replace(' ', '')

    def cleanWebsite(x):
        if x is None or x is ',':
            return ''
        else:
            return x.replace('https://', '').replace('http://', '').replace('www.', '')

    def cleanName(x):
        if x is None or x is ',':
            return ''
        else:
            x = x.replace('\'', '').replace('#', '').replace('&', '').replace('-', ' ').replace('/', ' ')\
                .replace(":","").replace("@","")
            x = x.lower()
            x = " "+x
            x = x.replace(" zero","0").replace(" one","1").replace(" two","2").replace(" three","3")\
                .replace(" four","4").replace(" five","5").replace(" six","6").replace(" seven","7")\
                .replace(" eight","8").replace(" nine","9")
            x = x.strip()
            return x

    def cleanAddress(x):
        if x is None or x is ',':
            return ''
        else:
            x = x.replace('\'', '').replace('#', '').replace('&', '').replace('.', '').replace('@', '').\
            replace(',', '').replace('-', ' ').replace('/', ' ').replace(":","")
            x = x.lower()
            x = x.replace('street', 'st').replace('avenue', 'ave').replace('boulevard', 'blvd')\
                 .replace('place', 'pl').replace('square', 'sq').replace('plaza', 'plz')
            x = " "+x
            x = x.replace(" zero","0").replace(" one","1").replace(" two","2").replace(" three","3")\
                .replace(" four","4").replace(" five","5").replace(" six","6").replace(" seven","7")\
                .replace(" eight","8").replace(" nine","9")
            x = x.strip()
            return x

    foursquare['phone'] = foursquare['phone'].map(cleanPhone)
    foursquare['website'] = foursquare['website'].map(cleanWebsite)
    foursquare['name'] = foursquare['name'].map(cleanName)
    foursquare['street_address'] = foursquare['street_address'].map(cleanAddress)

    locu['phone'] = locu['phone'].map(cleanPhone)
    locu['website'] = locu['website'].map(cleanWebsite)
    locu['name'] = locu['name'].map(cleanName)
    locu['street_address'] = locu['street_address'].map(cleanAddress)

    f=open("fs_clean.json","w")
    f.write(foursquare.to_json(orient='records'))
    f=open("lc_clean.json","w")
    f.write(locu.to_json(orient='records'))

In [1508]:
def binary_search(array, target, key):
    lower = 0
    upper = len(array)
    while lower < upper:
        x = lower + int((upper - lower)/2)
        val = array[x][key]
        if target == val:
            return x
        elif target > val:
            if lower == x:
                return lower
            lower = x
        elif target < val:
            upper = x
    return lower

In [1509]:
clean_data("train","foursquare_train","locu_train")

In [1510]:
matches = open('train/matches_train.csv').readlines()[1:]
locu_match = {}
ch = {}
for l in matches:
    l = l.strip().split(',')
    locu_match[l[0]] = l[1]

In [1511]:
foursquare_data = json.load(open('fs_clean.json'))
locu_data = json.load(open('lc_clean.json'))
foursquare_data = np.array(foursquare_data)
locu_data = np.array(locu_data)

fs_id_ind = {}
lo_id_ind = {}
for l in locu_data:
    lo_id_ind[l['id']] = l
fs_phone_ind = {}
fs_name_ind = {}
for l in foursquare_data:
    fs_id_ind[l['id']] = l
    if l['phone'] is not None and l['phone']!='':
        fs_phone_ind[l['phone']] = l
    if l['name'].replace(" ","") not in fs_name_ind:
        fs_name_ind[l['name'].replace(" ","")] = set()
    fs_name_ind[l['name'].replace(" ","")].add(l['id'])

foursquare_sortedx = sorted(foursquare_data,key=lambda x:x['latitude'])
foursquare_sortedy = sorted(foursquare_data,key=lambda x:x['longitude'])

param_lat = 0.002
param_lon = 0.002
tot = 0
c=0
ann = {}
for index,l in enumerate(locu_data):
    if l['latitude'] is not None and l['longitude'] is not None:

        lat_l = binary_search(foursquare_sortedx,l['latitude']-param_lat,'latitude')
        lat_r = binary_search(foursquare_sortedx,l['latitude']+param_lat,'latitude')
        lon_l = binary_search(foursquare_sortedy,l['longitude']-param_lon,'longitude')
        lon_r = binary_search(foursquare_sortedy,l['longitude']+param_lon,'longitude')
        setA = set()
        for i in range(lat_l,min(lat_r+2,len(foursquare_data))):
            setA.add(foursquare_sortedx[i]['id'])
        setB = set()
        for i in range(lon_l,min(lon_r+2,len(foursquare_data))):
            setB.add(foursquare_sortedy[i]['id'])
        setC = setA.intersection(setB)
        if l['name'].replace(" ","") in fs_name_ind:
            setC.update(fs_name_ind[l['name'].replace(" ","")])
        if l['phone'] in fs_phone_ind:
            setC.add(fs_phone_ind[l['phone']]['id'])
        if l['id'] in locu_match:
            if locu_match[l['id']] in setC:
                c+=1
            else:
                if l["id"] in ["493f5e2798de851ec3b2","5f3fd107090d0ddc658b","212dffb393f745df801a",
                               "edeba23f215dcc702220","c170270283ef870d546b"]:
                    setC.add(locu_match[l['id']])
                print(lo_id_ind[l['id']],fs_id_ind[locu_match[l['id']]])
                print(lat_l,lat_r,lon_l,lon_r)
                print(lo_id_ind[l['id']]['latitude']-fs_id_ind[locu_match[l['id']]]['latitude'])
                print(lo_id_ind[l['id']]['longitude']-fs_id_ind[locu_match[l['id']]]['longitude'])
                print("---------------------------------------------")
        ann[index] = setC
        tot+=len(setC)
#     else:
#         setC = set()
#         for i in foursquare_data:
#             setC.add(i["id"])
#         ann[index] = setC
#         tot+=len(setC)
#     else:
#         setC = set()
#         if l['name'].replace(" ","") in fs_name_ind:
#             setC.update(fs_name_ind[l['name'].replace(" ","")])
#         if l['phone'] in fs_phone_ind:
#             setC.add(fs_phone_ind[l['phone']]['id'])
#         ann[index] = setC
#         tot+=len(setC)
print(c,tot)

{'id': 'c170270283ef870d546b', 'latitude': 40.766195, 'longitude': -73.977825, 'name': 'exhale spa', 'phone': '2125617400', 'postal_code': '10019', 'street_address': '150 central park south', 'website': 'exhalespa.com/locations/midtown'} {'id': '51eb7eed498e401ec51196b6', 'latitude': 40.75798, 'longitude': -73.98122, 'name': 'halel cart', 'phone': '', 'postal_code': '', 'street_address': '', 'website': ''}
433 457 379 422
0.008214999999999861
0.0033949999999975944
---------------------------------------------
{'id': '5f3fd107090d0ddc658b', 'latitude': 40.713998, 'longitude': -73.996882, 'name': 'tsung sun social club', 'phone': '2122269414', 'postal_code': '10002', 'street_address': '11 division st', 'website': ''} {'id': '51ce011a498ed8dfb15381bb', 'latitude': 40.7726492359, 'longitude': -73.9761875778, 'name': 'spring social running club', 'phone': '', 'postal_code': '', 'street_address': '', 'website': ''}
32 54 119 159
-0.0586512359000011
-0.020694422199994733
---------------------

In [1512]:
from collections import Counter

In [1513]:
def calc_cosine(a,b):
    a = Counter(a)
    b = Counter(b)
    dot = 0
    a2 = 0
    b2 = 0
    for l in a:
        if l in b:
            dot = a[l]*b[l]
    for l in a:
        a2+=a[l]**2
    for l in b:
        b2+=b[l]**2
    return dot/((a2*b2)**0.5)

In [1514]:
def create_features(l1,l2):
    feat = []
    f_code = 0
    if l1['postal_code']=='' or l2['postal_code']=='':
        f_code=0
    elif l1['postal_code']==l2['postal_code']:
        f_code = 1
    feat.append(f_code)

    f_phone = 0
    f_phone_last = 0
    if l1['phone']=='' or l2['phone']=='':
        f_phone=0
        f_phone_last = 0
    else:
        if l1['phone']==l2['phone']:
            f_phone=1
        if l1['phone'][-4:] == l2['phone'][-4:]:
            f_phone_last = 1
    feat.append(f_phone)
    feat.append(f_phone_last)
    
    f_name = 0
    f_name_jaccard = 0
    f_name_exact = 0
    f_name_cosine = 0
    f_name_char = 0
    f_name_match = 0
    f_name_jaccard2 = 0
    if l1['name']=='' or l2['name']=='':
        f_name = 0
        f_name_jaccard = 0
        f_name_exact = 0
        f_name_jaccard2 = 0
        f_name_char = 0
        f_name_cosine = 0
        f_name_match = 0
    else:
        if l1["name"]==l2["name"]:
            f_name_exact = 1
        f_name = ed.eval(l1['name'],l2['name'])/max(len(l1['name']),len(l2['name']))
        setA = set(l1['name'].split(" "))
        setB = set(l2['name'].split(" "))
        f_name_jaccard = len(setA.intersection(setB))/len(setA.union(setB))
        stem_l1 = [stemmer.stem(i) for i in l1["name"].split(" ")]
        stem_l2 = [stemmer.stem(i) for i in l2["name"].split(" ")]
        f_name_match = len(set(stem_l1).intersection(set(stem_l2)))
        f_name_jaccard2 = len(set(stem_l1).intersection(set(stem_l2)))/len(set(stem_l1).union(set(stem_l2)))
        f_name_cosine = calc_cosine(stem_l1,stem_l2)
        t1 = [i for i in l1["name"]]
        t2 = [i for i in l2["name"]]
        f_name_char = calc_cosine(t1,t2)
    feat.append(f_name)
    feat.append(f_name_jaccard)
    feat.append(f_name_jaccard2)
    feat.append(f_name_exact)
    feat.append(f_name_cosine)
    feat.append(f_name_match)
    feat.append(f_name_char)

    f_add = 0
    f_add_num = 0
    f_add_exact = 0
    if l1['street_address']=='' or l2['street_address']=='':
        f_add = 1
        f_add_num = 0
        f_exact = 0
    else:
        if l1['street_address']==l2['street_address']:
            f_add_exact = 1
        f_add = ed.eval(l1['street_address'],l2['street_address'])/max(len(l1['street_address']),len(l2['street_address']))
        l1_num = set(re.findall(r'\d+', l1['street_address']))
        l2_num = set(re.findall(r'\d+', l2['street_address']))
        if len(l1_num.union(l2_num))==0:
            f_add_num = 0
        else:
            f_add_num = len(l1_num.intersection(l2_num))/len(l1_num.union(l2_num))
    feat.append(f_add)
    feat.append(f_add_num)
    feat.append(f_add_exact)

    f_web = 0
    if l1['website']=='' or l2['website']=='':
        f_web = 1
    else:
        f_web = ed.eval(l1['website'],l2['website'])/max(len(l1['website']),len(l2['website']))
    feat.append(f_web)

    f_lat = 1
    f_lon = 1
    if l1['latitude'] is not None and l2['latitude'] is not None:
        f_lat = l1['latitude']-l2['latitude']
        f_lon = l1['longitude']-l2['longitude']
    feat.append(f_lat)
    feat.append(f_lon)
    
    return feat

In [1515]:
x_train = []
y_train = []
x_test = []
y_test = []
X = []
y = []
pairs = []
c = 0
train_sample = 0
for l in ann:
    key = 0
    if locu_data[l]["id"] in ["493f5e2798de851ec3b2","5f3fd107090d0ddc658b","212dffb393f745df801a",
                               "edeba23f215dcc702220","c170270283ef870d546b"]:
        key = 1
    c+=1
    for fid in ann[l]:
        l1 = locu_data[l]
        l2 = fs_id_ind[fid]
        if l1['id'] in locu_match and locu_match[l1['id']]==fid:
            label=1
        else:
            label=0
        
        pairs.append((l1,l2))
        
        feat = create_features(l1,l2)
        
        if c<=450:
            train_sample+=1
        X.append(feat)
        y.append(label)
        if key==1 and c<=450:
            for r in range(20):
                if c<=450:
                    train_sample+=1
                pairs.append((l1,l2))
                X.append(feat)
                y.append(label)

In [1516]:
len(X),len(y)

(2757, 2757)

In [1517]:
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.metrics import f1_score,precision_score, recall_score

In [1518]:
X = np.array(X)
y = np.array(y)

# poly = Poly(interaction_only=True)
# X = poly.fit_transform(X)
x_train = X[:train_sample]
x_test = X[train_sample:]
y_train = y[:train_sample]
y_test = y[train_sample:]

In [1519]:
l_max = 0
mval = 0
d_val = 0
for d in [2,4,6,8]:
    for l in range(10,201,5):
        rfc = RFC(n_estimators=l,max_depth=d).fit(x_train,y_train)
        y_pred = rfc.predict(x_test)
        print(f1_score(y_pred,y_test),precision_score(y_pred,y_test),recall_score(y_pred,y_test),accuracy_score(y_pred,y_test))
        if accuracy_score(y_pred,y_test) > mval:
            mval= accuracy_score(y_pred,y_test)
            l_max = l
            d_val = d
        y_pred = rfc.predict(x_train)

rfc = RFC(n_estimators=l_max,max_depth=d_val).fit(x_train,y_train)
y_pred = rfc.predict(x_test)
rfc = RFC(n_estimators=l_max,max_depth=d_val).fit(X,y)

0.936170212766 0.988764044944 0.888888888889 0.980295566502
0.910052910053 0.966292134831 0.86 0.972085385878
0.925531914894 0.977528089888 0.878787878788 0.977011494253
0.925531914894 0.977528089888 0.878787878788 0.977011494253
0.882051282051 0.966292134831 0.811320754717 0.96223316913
0.887755102041 0.977528089888 0.81308411215 0.963875205255
0.910052910053 0.966292134831 0.86 0.972085385878
0.924731182796 0.966292134831 0.886597938144 0.977011494253
0.924731182796 0.966292134831 0.886597938144 0.977011494253
0.882051282051 0.966292134831 0.811320754717 0.96223316913
0.930481283422 0.977528089888 0.887755102041 0.978653530378
0.919786096257 0.966292134831 0.877551020408 0.975369458128
0.910052910053 0.966292134831 0.86 0.972085385878
0.882051282051 0.966292134831 0.811320754717 0.96223316913
0.910052910053 0.966292134831 0.86 0.972085385878
0.882051282051 0.966292134831 0.811320754717 0.96223316913
0.910052910053 0.966292134831 0.86 0.972085385878
0.915789473684 0.977528089888 0.861

0.977528089888 0.977528089888 0.977528089888 0.993431855501
0.977528089888 0.977528089888 0.977528089888 0.993431855501
0.977528089888 0.977528089888 0.977528089888 0.993431855501
0.983050847458 0.977528089888 0.988636363636 0.995073891626
0.983050847458 0.977528089888 0.988636363636 0.995073891626
0.983050847458 0.977528089888 0.988636363636 0.995073891626
0.983240223464 0.988764044944 0.977777777778 0.995073891626
0.977528089888 0.977528089888 0.977528089888 0.993431855501
0.977528089888 0.977528089888 0.977528089888 0.993431855501
0.977528089888 0.977528089888 0.977528089888 0.993431855501
0.983050847458 0.977528089888 0.988636363636 0.995073891626
0.977528089888 0.977528089888 0.977528089888 0.993431855501
0.983050847458 0.977528089888 0.988636363636 0.995073891626
0.977528089888 0.977528089888 0.977528089888 0.993431855501
0.983240223464 0.988764044944 0.977777777778 0.995073891626
0.983050847458 0.977528089888 0.988636363636 0.995073891626


In [1520]:
c = 0
for l in range(len(y_test)):
    if y_test[l] != y_pred[l]:
        c+=1
        print(pairs[len(y_train)+l],y_test[l],y_pred[l])
print(c)

({'id': '0b6dfce85d74f1b65f52', 'latitude': 40.781951, 'longitude': -73.975479, 'name': 'west 79 street boat basin cafe', 'phone': '2124965542', 'postal_code': '10024', 'street_address': 'w 79 st', 'website': 'boatbasincafe.com/'}, {'id': '40ede000f964a5203c0a1fe3', 'latitude': 40.785675637, 'longitude': -73.9845085144, 'name': 'boat basin café', 'phone': '2124965542', 'postal_code': '10024', 'street_address': '390 w 79th st', 'website': ''}) 0 1
({'id': '493f5e2798de851ec3b2', 'latitude': 40.758207, 'longitude': -73.992323, 'name': 'pick a bagel', 'phone': '2127928008', 'postal_code': '10036', 'street_address': '360 w 42nd st', 'website': 'pickabagel42ndstreetnyc.com'}, {'id': '51f119e7498e9716f71f4413', 'latitude': 40.756361, 'longitude': -73.967311, 'name': 'fresh bagels', 'phone': '', 'postal_code': '', 'street_address': '', 'website': ''}) 1 0
2


In [1521]:
rfc.feature_importances_

array([ 0.0024405 ,  0.00902927,  0.01165329,  0.03078243,  0.31124365,
        0.22245691,  0.03281519,  0.06202546,  0.01675938,  0.01456541,
        0.09965564,  0.03066362,  0.05121992,  0.00255444,  0.0549429 ,
        0.04719202])

In [1522]:
clean_data("test","foursquare_test","locu_test")

In [1523]:
foursquare_data = json.load(open('fs_clean.json'))
locu_data = json.load(open('lc_clean.json'))
foursquare_data = np.array(foursquare_data)
locu_data = np.array(locu_data)

fs_id_ind = {}
lo_id_ind = {}
for l in locu_data:
    lo_id_ind[l['id']] = l
fs_phone_ind = {}
fs_name_ind = {}
for l in foursquare_data:
    fs_id_ind[l['id']] = l
    if l['phone'] is not None and l['phone']!='':
        fs_phone_ind[l['phone']] = l
    if l['name'].replace(" ","") not in fs_name_ind:
        fs_name_ind[l['name'].replace(" ","")] = set()
    fs_name_ind[l['name'].replace(" ","")].add(l['id'])

foursquare_sortedx = sorted(foursquare_data,key=lambda x:x['latitude'])
foursquare_sortedy = sorted(foursquare_data,key=lambda x:x['longitude'])

tot = 0
ann = {}
for index,l in enumerate(locu_data):
    if l['latitude'] is not None and l['longitude'] is not None:
        lat_l = binary_search(foursquare_sortedx,l['latitude']-param_lat,'latitude')
        lat_r = binary_search(foursquare_sortedx,l['latitude']+param_lat,'latitude')
        lon_l = binary_search(foursquare_sortedy,l['longitude']-param_lon,'longitude')
        lon_r = binary_search(foursquare_sortedy,l['longitude']+param_lon,'longitude')
        setA = set()
        for i in range(lat_l,min(lat_r+2,len(foursquare_data))):
            setA.add(foursquare_sortedx[i]['id'])
        setB = set()
        for i in range(lon_l,min(lon_r+2,len(foursquare_data))):
            setB.add(foursquare_sortedy[i]['id'])
        setC = setA.intersection(setB)
        if l['name'].replace(" ","") in fs_name_ind:
            setC.update(fs_name_ind[l['name'].replace(" ","")])
        if l['phone'] in fs_phone_ind:
            setC.add(fs_phone_ind[l['phone']]['id'])
        ann[index] = setC
        tot+=len(setC)
#     else:
#         setC = set()
#         for i in foursquare_data:
#             setC.add(i["id"])
#         ann[index] = setC
#         tot+=len(setC)
#     else:
#         setC = set()
#         if l['name'].replace(" ","") in fs_name_ind:
#             setC.update(fs_name_ind[l['name'].replace(" ","")])
#         if l['phone'] in fs_phone_ind:
#             setC.add(fs_phone_ind[l['phone']]['id'])
#         ann[index] = setC
#         tot+=len(setC)
print(tot)

1330


In [1524]:
X = []
c = 0
pairs = []
for l in ann:
    c+=1
    for fid in ann[l]:
        l1 = locu_data[l]
        l2 = fs_id_ind[fid]

        pairs.append((l1['id'],l2['id']))
        
        feat = create_features(l1,l2)

        X.append(feat)
len(X)

1330

In [1525]:
# X = poly.transform(X)
y_pred = rfc.predict(X)

In [1526]:
sum(y_pred)

232

In [1527]:
len(locu_data)

400

In [1528]:
y_pred_prob = rfc.predict_proba(X)

In [1529]:
y_pred_2 = np.array(y_pred)
y_pred = []
for l in y_pred_prob:
    if l[1]>0.3:
        y_pred.append(1)
    else:
        y_pred.append(0)

In [1530]:
for l in range(len(y_pred)):
    if y_pred[l]==1:
        print(y_pred_prob[l], pairs[l])

[ 0.  1.] ('b48da849c54f904013e2', '4de5a887d4c09fc98b6e0498')
[ 0.02162406  0.97837594] ('95ad783fd1c65bb8fdbf', '459f6987f964a520c0401fe3')
[ 0.21213424  0.78786576] ('206c363a5907bfa98ec0', '4fa163a4e4b00c5d71cf15c5')
[ 0.  1.] ('cb95d1e0730222cc3209', '4cc5f773c844721edb15f201')
[ 0.13333333  0.86666667] ('ecdc736a7d663a46e01d', '4b4ce0d1f964a520b9c226e3')
[ 0.52138957  0.47861043] ('25ca87e725b930488ed6', '4ab26766f964a520146b20e3')
[ 0.02162406  0.97837594] ('4f9710321455164d1cb4', '500a9f95e4b00d76e499d83e')
[ 0.  1.] ('138aed748ccdfed0845d', '4b284ea7f964a520519224e3')
[ 0.  1.] ('5013d96a9633f92f2dbf', '4f3257e619836c91c7ce6b37')
[ 0.  1.] ('b2affc7b53499fd8ee76', '4e4c4f7ebd413c4cc66867f1')
[ 0.16072516  0.83927484] ('7b11cde179779d2e814d', '4d6a8a5d0a25b60cb4743290')
[ 0.  1.] ('1ff1b1c1d5252ca3682a', '4b212d63f964a520c63724e3')
[ 0.  1.] ('417df85b1156d93935b3', '5103351ee4b0ad7f2e01d882')
[ 0.  1.] ('be74338f204c673792a9', '519b78372fc6182159f7eb0d')
[ 0.  1.] ('76b75b0a67

In [1536]:
unique_locu = {}
for l in range(len(y_pred)):
    if y_pred[l]==1:
        ind = l
        l = pairs[l]
        if l[0] not in unique_locu:
            unique_locu[l[0]] = (l[1],y_pred_prob[ind][1])
        elif unique_locu[l[0]][1]<y_pred_prob[ind][1]:
            unique_locu[l[0]] = (l[1],y_pred_prob[ind][1])

In [1537]:
unique_forsquare = {}
for l in unique_locu:
    if unique_locu[l][0] not in unique_forsquare:
        unique_forsquare[unique_locu[l][0]] = (l,unique_locu[l][1])
    elif unique_forsquare[unique_locu[l][0]][1] < unique_locu[l][1]:
        unique_forsquare[unique_locu[l][0]] = (l,unique_locu[l][1])

In [1538]:
f = open('matches_test.csv','w')
f.write("locu_id,foursquare_id\n")
for l in unique_forsquare:
    f.write(str(unique_forsquare[l][0])+','+str(l)+'\n')
f.close()

In [1532]:
len(unique_locu),len(unique_forsquare)

(236, 241)

In [1533]:
f = open("temp.csv").readlines()[1:]

In [1534]:
temp = {}
for l in f:
    l = l.strip().split(",")
    if l[0] not in unique_locu:
        print(l)
    temp[l[0]] = l[1]